Unverified Commit 415e9a09 authored by Matt, committed by GitHub

Add tf_keras imports to prepare for Keras 3 (#28588)

* Port core files + ESM (because ESM code is odd)

* Search-replace in modelling code

* Fix up transfo_xl as well

* Fix other core files + tests (still need to add correct import to tests)

* Fix cookiecutter

* make fixup, fix imports in some more core files

* Auto-add imports to tests

* Cleanup, add imports to sagemaker tests

* Use correct exception for importing tf_keras

* Fixes in modeling_tf_utils

* make fixup

* Correct version parsing code

* Ensure the pipeline tests correctly revert to float32 after each test

* Ensure the pipeline tests correctly revert to float32 after each test

* More tf.keras -> keras

* Add dtype cast

* Better imports of tf_keras

* Add a cast for tf.assign, just in case

* Fix callback imports
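
The recurring change across the core files, examples, and tests below is a guarded import that prefers the backwards-compatible `tf_keras` package and fails with a clear message when only Keras 3 is installed. A minimal sketch of that guard, mirroring the blocks added in the hunks below:

```python
from packaging.version import parse

try:
    # Prefer the Keras 2 compatibility package shipped for TensorFlow.
    import tf_keras as keras
except (ModuleNotFoundError, ImportError):
    import keras

    # Standalone Keras 3 cannot back the TF model classes yet, so bail out early.
    if parse(keras.__version__).major > 2:
        raise ValueError(
            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
            "Transformers. Please install the backwards-compatible tf-keras package with "
            "`pip install tf-keras`."
        )
```

Modeling code and callbacks then use this `keras` symbol (re-exported from `transformers.modeling_tf_utils`) in place of `tf.keras`.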
@@ -69,7 +69,6 @@ TensorFlowの[model.save](https://www.tensorflow.org/tutorials/keras/save_and_lo
```py
>>> from transformers import TFPreTrainedModel
->>> from tensorflow import keras
>>> model.save_weights("some_folder/tf_model.h5")
>>> model = TFPreTrainedModel.from_pretrained("some_folder")
...
@@ -47,6 +47,7 @@ from transformers import (
    set_seed,
)
from transformers.keras_callbacks import KerasMetricCallback
+from transformers.modeling_tf_utils import keras
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
@@ -363,7 +364,7 @@ def main():
    def _train_transforms(image):
        img_size = image_size
-        image = tf.keras.utils.img_to_array(image)
+        image = keras.utils.img_to_array(image)
        image = random_resized_crop(image, size=img_size)
        image = tf.image.random_flip_left_right(image)
        image /= 255.0
@@ -372,7 +373,7 @@ def main():
        return image
    def _val_transforms(image):
-        image = tf.keras.utils.img_to_array(image)
+        image = keras.utils.img_to_array(image)
        image = tf.image.resize(image, size=image_size)
        # image = np.array(image) # FIXME - use tf.image function
        image = center_crop(image, size=image_size)
...
@@ -22,6 +22,7 @@ import os
import re
import tensorflow as tf
+from packaging.version import parse
from transformers import (
    AutoConfig,
@@ -33,6 +34,19 @@ from transformers import (
)
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
logger = logging.getLogger(__name__)
AUTO = tf.data.AUTOTUNE
@@ -209,7 +223,7 @@ def main(args):
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    if args.bfloat16:
-        tf.keras.mixed_precision.set_global_policy("mixed_bfloat16")
+        keras.mixed_precision.set_global_policy("mixed_bfloat16")
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    config = AutoConfig.from_pretrained(args.pretrained_model_config)
...
@@ -30,6 +30,7 @@ from typing import Optional
import evaluate
import tensorflow as tf
from datasets import load_dataset
+from packaging.version import parse
from utils_qa import postprocess_qa_predictions
import transformers
@@ -48,6 +49,19 @@ from transformers import (
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.38.0.dev0")
@@ -233,7 +247,7 @@ class DataTrainingArguments:
# region Helper classes
-class SavePretrainedCallback(tf.keras.callbacks.Callback):
+class SavePretrainedCallback(keras.callbacks.Callback):
    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
    # that saves the model with this method after each epoch.
...
@@ -23,6 +23,20 @@ from unittest import skip
from unittest.mock import patch
import tensorflow as tf
+from packaging.version import parse
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
from transformers.testing_utils import TestCasePlus, get_gpu_count, slow
@@ -115,7 +129,7 @@ class ExamplesTests(TestCasePlus):
        with patch.object(sys, "argv", testargs):
            run_text_classification.main()
            # Reset the mixed precision policy so we don't break other tests
-            tf.keras.mixed_precision.set_global_policy("float32")
+            keras.mixed_precision.set_global_policy("float32")
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["eval_accuracy"], 0.75)
...
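
Several example tests set a mixed-precision policy and, as the commit log notes, must revert to float32 afterwards so later tests are unaffected. A minimal illustration of that reset pattern, assuming a plain unittest case (the class and test names here are hypothetical):

```python
import unittest

try:
    import tf_keras as keras
except (ModuleNotFoundError, ImportError):
    import keras  # assumed to be Keras 2.x, guarded as in the hunks above


class MixedPrecisionPolicyTest(unittest.TestCase):  # hypothetical test case
    def tearDown(self):
        # Reset the global policy so the next test runs in float32 again.
        keras.mixed_precision.set_global_policy("float32")

    def test_runs_under_mixed_precision(self):
        keras.mixed_precision.set_global_policy("mixed_float16")
        self.assertEqual(keras.mixed_precision.global_policy().name, "mixed_float16")


if __name__ == "__main__":
    unittest.main()
```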
@@ -27,6 +27,7 @@ from typing import Optional
import numpy as np
from datasets import load_dataset
+from packaging.version import parse
from transformers import (
    AutoConfig,
@@ -46,11 +47,24 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" # Reduce the amount of console output
import tensorflow as tf # noqa: E402
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
logger = logging.getLogger(__name__)
# region Helper classes
-class SavePretrainedCallback(tf.keras.callbacks.Callback):
+class SavePretrainedCallback(keras.callbacks.Callback):
    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
    # that saves the model with this method after each epoch.
...
@@ -15,7 +15,20 @@
import math
import tensorflow as tf
-from packaging import version
+from packaging.version import parse
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
def _gelu(x):
@@ -99,12 +112,12 @@ def glu(x, axis=-1):
    return a * tf.math.sigmoid(b)
-if version.parse(tf.version.VERSION) >= version.parse("2.4"):
+if parse(tf.version.VERSION) >= parse("2.4"):
    def approximate_gelu_wrap(x):
-        return tf.keras.activations.gelu(x, approximate=True)
+        return keras.activations.gelu(x, approximate=True)
-    gelu = tf.keras.activations.gelu
+    gelu = keras.activations.gelu
    gelu_new = approximate_gelu_wrap
else:
    gelu = _gelu
@@ -119,11 +132,11 @@ ACT2FN = {
    "glu": glu,
    "mish": mish,
    "quick_gelu": quick_gelu,
-    "relu": tf.keras.activations.relu,
-    "sigmoid": tf.keras.activations.sigmoid,
-    "silu": tf.keras.activations.swish,
-    "swish": tf.keras.activations.swish,
-    "tanh": tf.keras.activations.tanh,
+    "relu": keras.activations.relu,
+    "sigmoid": keras.activations.sigmoid,
+    "silu": keras.activations.swish,
+    "swish": keras.activations.swish,
+    "tanh": keras.activations.tanh,
}
...
@@ -8,16 +8,16 @@ import numpy as np
import tensorflow as tf
from huggingface_hub import Repository, create_repo
from packaging.version import parse
-from tensorflow.keras.callbacks import Callback
from . import IntervalStrategy, PreTrainedTokenizerBase
from .modelcard import TrainingSummary
+from .modeling_tf_utils import keras
logger = logging.getLogger(__name__)
-class KerasMetricCallback(Callback):
+class KerasMetricCallback(keras.callbacks.Callback):
    """
    Callback to compute metrics at the end of every epoch. Unlike normal Keras metrics, these do not need to be
    compilable by TF. It is particularly useful for common NLP metrics like BLEU and ROUGE that require string
@@ -265,7 +265,7 @@ class KerasMetricCallback(Callback):
        logs.update(metric_output)
-class PushToHubCallback(Callback):
+class PushToHubCallback(keras.callbacks.Callback):
    """
    Callback that will save and push the model to the Hub regularly. By default, it pushes once per epoch, but this can
    be changed with the `save_strategy` argument. Pushed models can be accessed like any other model on the hub, such
...
@@ -704,7 +704,7 @@ class TrainingSummary:
def parse_keras_history(logs):
    """
-    Parse the `logs` of either a `tf.keras.History` object returned by `model.fit()` or an accumulated logs `dict`
+    Parse the `logs` of either a `keras.History` object returned by `model.fit()` or an accumulated logs `dict`
    passed to the `PushToHubCallback`. Returns lines and logs compatible with those returned by `parse_log_history`.
    """
    if hasattr(logs, "history"):
@@ -800,14 +800,14 @@ def parse_log_history(log_history):
def extract_hyperparameters_from_keras(model):
-    import tensorflow as tf
+    from .modeling_tf_utils import keras
    hyperparameters = {}
    if hasattr(model, "optimizer") and model.optimizer is not None:
        hyperparameters["optimizer"] = model.optimizer.get_config()
    else:
        hyperparameters["optimizer"] = None
-    hyperparameters["training_precision"] = tf.keras.mixed_precision.global_policy().name
+    hyperparameters["training_precision"] = keras.mixed_precision.global_policy().name
    return hyperparameters
...
@@ -260,7 +260,6 @@ def load_pytorch_state_dict_in_tf2_model(
    """Load a pytorch state_dict in a TF 2.0 model. pt_state_dict can be either an actual dict or a lazy-loading
    safetensors archive created with the safe_open() function."""
    import tensorflow as tf
-    from keras import backend as K
    if tf_inputs is None:
        tf_inputs = tf_model.dummy_inputs
@@ -360,7 +359,7 @@ def load_pytorch_state_dict_in_tf2_model(
        tf_loaded_numel += tensor_size(array)
-        K.set_value(symbolic_weight, array)
+        symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
        del array  # Immediately free memory to keep peak usage as low as possible
        all_pytorch_weights.discard(name)
...
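
In the weight-loading path, `K.set_value` is replaced by a direct `assign`, and the commit adds an explicit dtype cast ("Add a cast for tf.assign, just in case"). A small standalone sketch of that pattern, with made-up shapes and dtypes:

```python
import numpy as np
import tensorflow as tf

# A low-precision model weight and a float32 array coming from a PyTorch state_dict.
symbolic_weight = tf.Variable(tf.zeros((2, 2), dtype=tf.float16))
array = np.ones((2, 2), dtype=np.float32)

# Cast to the variable's dtype before assigning so the dtypes always line up,
# e.g. when the model was built under a mixed- or reduced-precision policy.
symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
print(symbolic_weight.dtype, symbolic_weight.numpy())
```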
@@ -44,6 +44,7 @@ from ...modeling_tf_utils import (
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
+    keras,
    keras_serializable,
    unpack_inputs,
)
@@ -84,9 +85,7 @@ class TFAlbertPreTrainingLoss:
    """
    def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
-        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
-        )
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        if self.config.tf_legacy_loss:
            # make sure only labels that are not equal to -100
            # are taken into account as loss
@@ -133,7 +132,7 @@ class TFAlbertPreTrainingLoss:
        return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,))
-class TFAlbertEmbeddings(tf.keras.layers.Layer):
+class TFAlbertEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""
    def __init__(self, config: AlbertConfig, **kwargs):
@@ -143,8 +142,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
        self.embedding_size = config.embedding_size
        self.max_position_embeddings = config.max_position_embeddings
        self.initializer_range = config.initializer_range
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
    def build(self, input_shape=None):
        with tf.name_scope("word_embeddings"):
@@ -217,7 +216,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
        return final_embeddings
-class TFAlbertAttention(tf.keras.layers.Layer):
+class TFAlbertAttention(keras.layers.Layer):
    """Contains the complete attention sublayer, including both dropouts and layer norm."""
    def __init__(self, config: AlbertConfig, **kwargs):
@@ -235,22 +234,22 @@ class TFAlbertAttention(tf.keras.layers.Layer):
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
        self.output_attentions = config.output_attentions
-        self.query = tf.keras.layers.Dense(
+        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
-        self.key = tf.keras.layers.Dense(
+        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
-        self.value = tf.keras.layers.Dense(
+        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
-        self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
-        self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.attention_dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+        self.output_dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config
    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
@@ -334,12 +333,12 @@ class TFAlbertAttention(tf.keras.layers.Layer):
                self.LayerNorm.build([None, None, self.config.hidden_size])
-class TFAlbertLayer(tf.keras.layers.Layer):
+class TFAlbertLayer(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFAlbertAttention(config, name="attention")
-        self.ffn = tf.keras.layers.Dense(
+        self.ffn = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
        )
@@ -348,13 +347,13 @@ class TFAlbertLayer(tf.keras.layers.Layer):
        else:
            self.activation = config.hidden_act
-        self.ffn_output = tf.keras.layers.Dense(
+        self.ffn_output = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
        )
-        self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
+        self.full_layer_layer_norm = keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
        )
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config
    def call(
@@ -401,7 +400,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
                self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
-class TFAlbertLayerGroup(tf.keras.layers.Layer):
+class TFAlbertLayerGroup(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)
@@ -453,7 +452,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
                layer.build(None)
-class TFAlbertTransformer(tf.keras.layers.Layer):
+class TFAlbertTransformer(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)
@@ -461,7 +460,7 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
        self.num_hidden_groups = config.num_hidden_groups
        # Number of layers in a hidden group
        self.layers_per_group = int(config.num_hidden_layers / config.num_hidden_groups)
-        self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
+        self.embedding_hidden_mapping_in = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="embedding_hidden_mapping_in",
@@ -534,13 +533,13 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
    base_model_prefix = "albert"
-class TFAlbertMLMHead(tf.keras.layers.Layer):
-    def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
+class TFAlbertMLMHead(keras.layers.Layer):
+    def __init__(self, config: AlbertConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.embedding_size = config.embedding_size
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
            config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        if isinstance(config.hidden_act, str):
@@ -548,7 +547,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
        else:
            self.activation = config.hidden_act
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
@@ -570,7 +569,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.embedding_size])
-    def get_output_embeddings(self) -> tf.keras.layers.Layer:
+    def get_output_embeddings(self) -> keras.layers.Layer:
        return self.decoder
    def set_output_embeddings(self, value: tf.Variable):
@@ -599,7 +598,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
@keras_serializable
-class TFAlbertMainLayer(tf.keras.layers.Layer):
+class TFAlbertMainLayer(keras.layers.Layer):
    config_class = AlbertConfig
    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True, **kwargs):
@@ -610,7 +609,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
        self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
        self.encoder = TFAlbertTransformer(config, name="encoder")
        self.pooler = (
-            tf.keras.layers.Dense(
+            keras.layers.Dense(
                units=config.hidden_size,
                kernel_initializer=get_initializer(config.initializer_range),
                activation="tanh",
@@ -620,7 +619,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
            else None
        )
-    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.embeddings
    def set_input_embeddings(self, value: tf.Variable):
@@ -776,7 +775,7 @@ ALBERT_START_DOCSTRING = r"""
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.
@@ -942,7 +941,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
        self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
        self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
-    def get_lm_head(self) -> tf.keras.layers.Layer:
+    def get_lm_head(self) -> keras.layers.Layer:
        return self.predictions
    @unpack_inputs
@@ -1032,12 +1031,12 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
                self.sop_classifier.build(None)
-class TFAlbertSOPHead(tf.keras.layers.Layer):
+class TFAlbertSOPHead(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)
-        self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
+        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
@@ -1070,7 +1069,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
        self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
        self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
-    def get_lm_head(self) -> tf.keras.layers.Layer:
+    def get_lm_head(self) -> keras.layers.Layer:
        return self.predictions
    @unpack_inputs
@@ -1184,8 +1183,8 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
        self.num_labels = config.num_labels
        self.albert = TFAlbertMainLayer(config, name="albert")
-        self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
+        self.classifier = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config
@@ -1283,8 +1282,8 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
            if config.classifier_dropout_prob is not None
            else config.hidden_dropout_prob
        )
-        self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(rate=classifier_dropout_prob)
+        self.classifier = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config
@@ -1372,7 +1371,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
        self.num_labels = config.num_labels
        self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
-        self.qa_outputs = tf.keras.layers.Dense(
+        self.qa_outputs = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        self.config = config
@@ -1478,8 +1477,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
        super().__init__(config, *inputs, **kwargs)
        self.albert = TFAlbertMainLayer(config, name="albert")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.classifier = keras.layers.Dense(
            units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config
...
@@ -38,6 +38,7 @@ from ...modeling_tf_utils import (
    TFModelInputType,
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
+    keras,
    keras_serializable,
    unpack_inputs,
)
@@ -116,7 +117,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    return (one_cst - expanded_mask) * LARGE_NEGATIVE
-class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding):
+class TFBartLearnedPositionalEmbedding(keras.layers.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """
@@ -143,7 +144,7 @@ class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding):
        return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype))
-class TFBartAttention(tf.keras.layers.Layer):
+class TFBartAttention(keras.layers.Layer):
    """Multi-headed attention from "Attention Is All You Need"""
    def __init__(
@@ -159,7 +160,7 @@ class TFBartAttention(tf.keras.layers.Layer):
        self.embed_dim = embed_dim
        self.num_heads = num_heads
-        self.dropout = tf.keras.layers.Dropout(dropout)
+        self.dropout = keras.layers.Dropout(dropout)
        self.head_dim = embed_dim // num_heads
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
@@ -169,10 +170,10 @@ class TFBartAttention(tf.keras.layers.Layer):
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
-        self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
-        self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
-        self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
-        self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
+        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
+        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
+        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
+        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
@@ -313,20 +314,20 @@ class TFBartAttention(tf.keras.layers.Layer):
                self.out_proj.build([None, None, self.embed_dim])
-class TFBartEncoderLayer(tf.keras.layers.Layer):
+class TFBartEncoderLayer(keras.layers.Layer):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model
        self.self_attn = TFBartAttention(
            self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
        )
-        self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
+        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
-        self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
-        self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
-        self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
-        self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
+        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
+        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
+        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        self.config = config
    def call(
@@ -390,7 +391,7 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
                self.final_layer_norm.build([None, None, self.embed_dim])
-class TFBartDecoderLayer(tf.keras.layers.Layer):
+class TFBartDecoderLayer(keras.layers.Layer):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model
@@ -401,11 +402,11 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
            name="self_attn",
            is_decoder=True,
        )
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
-        self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
-        self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
+        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
+        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        self.encoder_attn = TFBartAttention(
            self.embed_dim,
            config.decoder_attention_heads,
@@ -413,10 +414,10 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
            name="encoder_attn",
            is_decoder=True,
        )
-        self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
-        self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
-        self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
-        self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
+        self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
+        self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
+        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        self.config = config
    def call(
@@ -526,21 +527,21 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
                self.final_layer_norm.build([None, None, self.embed_dim])
-class TFBartClassificationHead(tf.keras.layers.Layer):
+class TFBartClassificationHead(keras.layers.Layer):
    """Head for sentence-level classification tasks."""
    def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name: str, **kwargs):
        super().__init__(name=name, **kwargs)
-        self.dense = tf.keras.layers.Dense(inner_dim, name="dense")
-        self.dropout = tf.keras.layers.Dropout(pooler_dropout)
-        self.out_proj = tf.keras.layers.Dense(num_classes, name="out_proj")
+        self.dense = keras.layers.Dense(inner_dim, name="dense")
+        self.dropout = keras.layers.Dropout(pooler_dropout)
+        self.out_proj = keras.layers.Dense(num_classes, name="out_proj")
        self.input_dim = inner_dim
        self.inner_dim = inner_dim
    def call(self, inputs):
        hidden_states = self.dropout(inputs)
        hidden_states = self.dense(hidden_states)
-        hidden_states = tf.keras.activations.tanh(hidden_states)
+        hidden_states = keras.activations.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states
@@ -583,7 +584,7 @@ BART_START_DOCSTRING = r"""
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.
@@ -740,7 +741,7 @@ BART_INPUTS_DOCSTRING = r"""
@keras_serializable
-class TFBartEncoder(tf.keras.layers.Layer):
+class TFBartEncoder(keras.layers.Layer):
    config_class = BartConfig
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
@@ -750,10 +751,10 @@ class TFBartEncoder(tf.keras.layers.Layer):
        config: BartConfig
    """
-    def __init__(self, config: BartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs):
+    def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.dropout = keras.layers.Dropout(config.dropout)
        self.layerdrop = config.encoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
@@ -766,7 +767,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
            name="embed_positions",
        )
        self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
-        self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
+        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
        self.embed_dim = config.d_model
    @unpack_inputs
@@ -900,7 +901,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
@keras_serializable
-class TFBartDecoder(tf.keras.layers.Layer):
+class TFBartDecoder(keras.layers.Layer):
    config_class = BartConfig
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBartDecoderLayer`]
@@ -910,7 +911,7 @@ class TFBartDecoder(tf.keras.layers.Layer):
        embed_tokens: output embedding
    """
-    def __init__(self, config: BartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs):
+    def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.padding_idx = config.pad_token_id
@@ -923,9 +924,9 @@ class TFBartDecoder(tf.keras.layers.Layer):
        )
        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
        self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
-        self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
+        self.dropout = keras.layers.Dropout(config.dropout)
    @unpack_inputs
    def call(
@@ -1130,16 +1131,16 @@ class TFBartDecoder(tf.keras.layers.Layer):
@keras_serializable
-class TFBartMainLayer(tf.keras.layers.Layer):
+class TFBartMainLayer(keras.layers.Layer):
    config_class = BartConfig
    def __init__(self, config: BartConfig, load_weight_prefix=None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
-        self.shared = tf.keras.layers.Embedding(
+        self.shared = keras.layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.d_model,
-            embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std),
+            embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
            name="model.shared",
        )
        # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
@@ -1358,9 +1359,9 @@ class TFBartModel(TFBartPretrainedModel):
                self.model.build(None)
-class BiasLayer(tf.keras.layers.Layer):
+class BiasLayer(keras.layers.Layer):
    """
-    Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis,
+    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
    so all weights have to be registered in a layer.
    """
...
@@ -81,7 +81,7 @@ def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name
        if any(x in var_name for x in tensors_to_transpose):
            torch_tensor = torch_tensor.T
        tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
-        tf.keras.backend.set_value(tf_var, torch_tensor)
+        tf_var.assign(tf.cast(torch_tensor, tf_var.dtype))
        tf_weight = session.run(tf_var)
        print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}")
...
@@ -49,6 +49,7 @@ from ...modeling_tf_utils import (
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
+    keras,
    keras_serializable,
    unpack_inputs,
)
@@ -121,9 +122,7 @@ class TFBertPreTrainingLoss:
    """
    def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
-        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
-        )
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
        unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0])
@@ -143,7 +142,7 @@ class TFBertPreTrainingLoss:
        return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,))
-class TFBertEmbeddings(tf.keras.layers.Layer):
+class TFBertEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""
    def __init__(self, config: BertConfig, **kwargs):
@@ -153,8 +152,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        self.initializer_range = config.initializer_range
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
    def build(self, input_shape=None):
        with tf.name_scope("word_embeddings"):
@@ -226,7 +225,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
        return final_embeddings
-class TFBertSelfAttention(tf.keras.layers.Layer):
+class TFBertSelfAttention(keras.layers.Layer):
    def __init__(self, config: BertConfig, **kwargs):
        super().__init__(**kwargs)
@@ -241,16 +240,16 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
-        self.query = tf.keras.layers.Dense(
+        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
-        self.key = tf.keras.layers.Dense(
+        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
-        self.value = tf.keras.layers.Dense(
+        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
-        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
        self.is_decoder = config.is_decoder
        self.config = config
@@ -358,15 +357,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
                self.value.build([None, None, self.config.hidden_size])
-class TFBertSelfOutput(tf.keras.layers.Layer):
+class TFBertSelfOutput(keras.layers.Layer):
    def __init__(self, config: BertConfig, **kwargs):
        super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config
    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
@@ -388,7 +387,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
                self.LayerNorm.build([None, None, self.config.hidden_size])
-class TFBertAttention(tf.keras.layers.Layer):
+class TFBertAttention(keras.layers.Layer):
    def __init__(self, config: BertConfig, **kwargs):
        super().__init__(**kwargs)
@@ -439,11 +438,11 @@ class TFBertAttention(tf.keras.layers.Layer):
                self.dense_output.build(None)
-class TFBertIntermediate(tf.keras.layers.Layer):
+class TFBertIntermediate(keras.layers.Layer):
    def __init__(self, config: BertConfig, **kwargs):
        super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
@@ -468,15 +467,15 @@ class TFBertIntermediate(tf.keras.layers.Layer):
                self.dense.build([None, None, self.config.hidden_size])
-class TFBertOutput(tf.keras.layers.Layer):
+class TFBertOutput(keras.layers.Layer):
    def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
...@@ -498,7 +497,7 @@ class TFBertOutput(tf.keras.layers.Layer): ...@@ -498,7 +497,7 @@ class TFBertOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLayer(tf.keras.layers.Layer): class TFBertLayer(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -601,7 +600,7 @@ class TFBertLayer(tf.keras.layers.Layer): ...@@ -601,7 +600,7 @@ class TFBertLayer(tf.keras.layers.Layer):
self.crossattention.build(None) self.crossattention.build(None)
class TFBertEncoder(tf.keras.layers.Layer): class TFBertEncoder(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -679,11 +678,11 @@ class TFBertEncoder(tf.keras.layers.Layer): ...@@ -679,11 +678,11 @@ class TFBertEncoder(tf.keras.layers.Layer):
layer.build(None) layer.build(None)
class TFBertPooler(tf.keras.layers.Layer): class TFBertPooler(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="tanh", activation="tanh",
...@@ -708,11 +707,11 @@ class TFBertPooler(tf.keras.layers.Layer): ...@@ -708,11 +707,11 @@ class TFBertPooler(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size]) self.dense.build([None, None, self.config.hidden_size])
class TFBertPredictionHeadTransform(tf.keras.layers.Layer): class TFBertPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
...@@ -723,7 +722,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -723,7 +722,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
else: else:
self.transform_act_fn = config.hidden_act self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
...@@ -745,8 +744,8 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -745,8 +744,8 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLMPredictionHead(tf.keras.layers.Layer): class TFBertLMPredictionHead(keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -768,7 +767,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -768,7 +767,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
with tf.name_scope(self.transform.name): with tf.name_scope(self.transform.name):
self.transform.build(None) self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer: def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable): def set_output_embeddings(self, value: tf.Variable):
...@@ -793,8 +792,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -793,8 +792,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
return hidden_states return hidden_states
class TFBertMLMHead(tf.keras.layers.Layer): class TFBertMLMHead(keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
...@@ -813,11 +812,11 @@ class TFBertMLMHead(tf.keras.layers.Layer): ...@@ -813,11 +812,11 @@ class TFBertMLMHead(tf.keras.layers.Layer):
self.predictions.build(None) self.predictions.build(None)
class TFBertNSPHead(tf.keras.layers.Layer): class TFBertNSPHead(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.seq_relationship = tf.keras.layers.Dense( self.seq_relationship = keras.layers.Dense(
units=2, units=2,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="seq_relationship", name="seq_relationship",
...@@ -839,7 +838,7 @@ class TFBertNSPHead(tf.keras.layers.Layer): ...@@ -839,7 +838,7 @@ class TFBertNSPHead(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBertMainLayer(tf.keras.layers.Layer): class TFBertMainLayer(keras.layers.Layer):
config_class = BertConfig config_class = BertConfig
def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs): def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs):
...@@ -852,7 +851,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -852,7 +851,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
self.encoder = TFBertEncoder(config, name="encoder") self.encoder = TFBertEncoder(config, name="encoder")
self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings return self.embeddings
def set_input_embeddings(self, value: tf.Variable): def set_input_embeddings(self, value: tf.Variable):
...@@ -1086,7 +1085,7 @@ BERT_START_DOCSTRING = r""" ...@@ -1086,7 +1085,7 @@ BERT_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
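Because the docstring promises a plain Keras `Model` subclass, the simplest end-to-end check of the new import path is to drive a model through the ordinary Keras API. A minimal sketch (checkpoint name, labels and hyperparameters are illustrative; recent `transformers` versions fall back to the model's internal loss when `compile()` is given none):

```py
import tensorflow as tf
from transformers import BertTokenizerFast, TFBertForSequenceClassification

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# A toy batch, tokenized straight to TF tensors.
batch = tokenizer(["great movie", "terrible movie"], padding=True, return_tensors="tf")
labels = tf.constant([1, 0])

# Standard Keras training entry points.
model.compile(optimizer="adam")
model.fit(dict(batch), labels, epochs=1)
```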
...@@ -1281,7 +1280,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): ...@@ -1281,7 +1280,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
self.nsp = TFBertNSPHead(config, name="nsp___cls") self.nsp = TFBertNSPHead(config, name="nsp___cls")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
def get_prefix_bias_name(self) -> str: def get_prefix_bias_name(self) -> str:
...@@ -1407,7 +1406,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ...@@ -1407,7 +1406,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
def get_prefix_bias_name(self) -> str: def get_prefix_bias_name(self) -> str:
...@@ -1500,7 +1499,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): ...@@ -1500,7 +1499,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
def get_prefix_bias_name(self) -> str: def get_prefix_bias_name(self) -> str:
...@@ -1732,8 +1731,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -1732,8 +1731,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
classifier_dropout = ( classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
) )
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout) self.dropout = keras.layers.Dropout(rate=classifier_dropout)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
...@@ -1825,8 +1824,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1825,8 +1824,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name="bert") self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
self.config = config self.config = config
...@@ -1947,8 +1946,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1947,8 +1946,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
classifier_dropout = ( classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
) )
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout) self.dropout = keras.layers.Dropout(rate=classifier_dropout)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
...@@ -2045,7 +2044,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ...@@ -2045,7 +2044,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.qa_outputs = tf.keras.layers.Dense( self.qa_outputs = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="qa_outputs", name="qa_outputs",
......
...@@ -5,10 +5,11 @@ import tensorflow as tf ...@@ -5,10 +5,11 @@ import tensorflow as tf
from tensorflow_text import BertTokenizer as BertTokenizerLayer from tensorflow_text import BertTokenizer as BertTokenizerLayer
from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs
from ...modeling_tf_utils import keras
from .tokenization_bert import BertTokenizer from .tokenization_bert import BertTokenizer
class TFBertTokenizer(tf.keras.layers.Layer): class TFBertTokenizer(keras.layers.Layer):
""" """
This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
`from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
......
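The `TFBertTokenizer` shown just above is the in-graph counterpart of the regular BERT tokenizer, which is exactly why it now needs the `keras` alias: it is itself a Keras layer. A small usage sketch (requires `tensorflow_text`; checkpoint name illustrative):

```py
import tensorflow as tf
from transformers import TFBertTokenizer

tf_tokenizer = TFBertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization runs as TensorFlow ops, so it can be traced into a tf.function
# (and therefore exported inside a SavedModel together with the model).
@tf.function
def preprocess(texts):
    return tf_tokenizer(texts)

batch = preprocess(tf.constant(["hello world", "in-graph tokenization"]))
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```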
...@@ -36,6 +36,7 @@ from ...modeling_tf_outputs import ( ...@@ -36,6 +36,7 @@ from ...modeling_tf_outputs import (
from ...modeling_tf_utils import ( from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFPreTrainedModel, TFPreTrainedModel,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
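For readers of these import hunks: `keras` is no longer `tf.keras` but a name re-exported by `modeling_tf_utils`, which prefers the `tf_keras` compatibility package and only falls back to plain `keras` when that is still Keras 2. A sketch of what the alias plausibly resolves to (the exact version check and error text belong to the real module, not to this note):

```py
from packaging.version import parse

try:
    # Keras 2 compatibility package, needed once TensorFlow ships with Keras 3.
    import tf_keras as keras
except (ModuleNotFoundError, ImportError):
    import keras

    if parse(keras.__version__).major > 2:
        raise ValueError(
            "Keras 3 is not supported by the TF modelling code; install the "
            "backwards-compatible `tf-keras` package instead."
        )
```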
...@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): ...@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
return (one_cst - expanded_mask) * LARGE_NEGATIVE return (one_cst - expanded_mask) * LARGE_NEGATIVE
class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding): class TFBlenderbotLearnedPositionalEmbedding(keras.layers.Embedding):
""" """
This module learns positional embeddings up to a fixed maximum size.
"""
...@@ -138,7 +139,7 @@ class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding): ...@@ -138,7 +139,7 @@ class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Blenderbot # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Blenderbot
class TFBlenderbotAttention(tf.keras.layers.Layer): class TFBlenderbotAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need""" """Multi-headed attention from "Attention Is All You Need"""
def __init__( def __init__(
...@@ -154,7 +155,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): ...@@ -154,7 +155,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim: if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError( raise ValueError(
...@@ -164,10 +165,10 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): ...@@ -164,10 +165,10 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
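The `_shape` helper above is the standard multi-head split; a quick shape check makes the transposition order concrete (numbers arbitrary):

```py
import tensorflow as tf

bsz, seq_len, num_heads, head_dim = 2, 5, 8, 64
x = tf.random.normal((bsz, seq_len, num_heads * head_dim))   # (2, 5, 512)

split = tf.transpose(
    tf.reshape(x, (bsz, seq_len, num_heads, head_dim)),      # (2, 5, 8, 64)
    (0, 2, 1, 3),                                             # -> (batch, heads, seq, head_dim)
)
assert split.shape == (bsz, num_heads, seq_len, head_dim)     # (2, 8, 5, 64)
```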
...@@ -309,20 +310,20 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): ...@@ -309,20 +310,20 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot
class TFBlenderbotEncoderLayer(tf.keras.layers.Layer): class TFBlenderbotEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotConfig, **kwargs): def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
self.self_attn = TFBlenderbotAttention( self.self_attn = TFBlenderbotAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
) )
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -387,7 +388,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer): ...@@ -387,7 +388,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot
class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): class TFBlenderbotDecoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotConfig, **kwargs): def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
...@@ -398,11 +399,11 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): ...@@ -398,11 +399,11 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
name="self_attn", name="self_attn",
is_decoder=True, is_decoder=True,
) )
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBlenderbotAttention( self.encoder_attn = TFBlenderbotAttention(
self.embed_dim, self.embed_dim,
config.decoder_attention_heads, config.decoder_attention_heads,
...@@ -410,10 +411,10 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): ...@@ -410,10 +411,10 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
name="encoder_attn", name="encoder_attn",
is_decoder=True, is_decoder=True,
) )
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -533,7 +534,7 @@ BLENDERBOT_START_DOCSTRING = r""" ...@@ -533,7 +534,7 @@ BLENDERBOT_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
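As with the BERT docstring earlier, nothing about the public entry points changes with the import swap; the TF Blenderbot classes are still used like any other Keras-backed `transformers` model. A small generation sketch (checkpoint name illustrative):

```py
from transformers import BlenderbotTokenizer, TFBlenderbotForConditionalGeneration

tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
model = TFBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")

inputs = tokenizer(["My friends are cool but they eat too many carbs."], return_tensors="tf")
reply_ids = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.batch_decode(reply_ids, skip_special_tokens=True))
```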
...@@ -677,7 +678,7 @@ BLENDERBOT_INPUTS_DOCSTRING = r""" ...@@ -677,7 +678,7 @@ BLENDERBOT_INPUTS_DOCSTRING = r"""
@keras_serializable @keras_serializable
class TFBlenderbotEncoder(tf.keras.layers.Layer): class TFBlenderbotEncoder(keras.layers.Layer):
config_class = BlenderbotConfig config_class = BlenderbotConfig
""" """
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
...@@ -687,10 +688,10 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): ...@@ -687,10 +688,10 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
config: BlenderbotConfig config: BlenderbotConfig
""" """
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings self.max_source_positions = config.max_position_embeddings
...@@ -703,7 +704,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): ...@@ -703,7 +704,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
name="embed_positions", name="embed_positions",
) )
self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
def get_embed_tokens(self): def get_embed_tokens(self):
return self.embed_tokens return self.embed_tokens
...@@ -849,7 +850,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): ...@@ -849,7 +850,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBlenderbotDecoder(tf.keras.layers.Layer): class TFBlenderbotDecoder(keras.layers.Layer):
config_class = BlenderbotConfig config_class = BlenderbotConfig
""" """
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`] Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`]
...@@ -859,7 +860,7 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): ...@@ -859,7 +860,7 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
embed_tokens: output embedding embed_tokens: output embedding
""" """
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
...@@ -872,9 +873,9 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): ...@@ -872,9 +873,9 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
) )
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBlenderbotDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] self.layers = [TFBlenderbotDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self): def get_embed_tokens(self):
return self.embed_tokens return self.embed_tokens
...@@ -1090,17 +1091,17 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): ...@@ -1090,17 +1091,17 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBlenderbotMainLayer(tf.keras.layers.Layer): class TFBlenderbotMainLayer(keras.layers.Layer):
config_class = BlenderbotConfig config_class = BlenderbotConfig
def __init__(self, config: BlenderbotConfig, **kwargs): def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.shared = tf.keras.layers.Embedding( self.shared = keras.layers.Embedding(
input_dim=config.vocab_size, input_dim=config.vocab_size,
output_dim=config.d_model, output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared", name="model.shared",
) )
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
...@@ -1325,9 +1326,9 @@ class TFBlenderbotModel(TFBlenderbotPreTrainedModel): ...@@ -1325,9 +1326,9 @@ class TFBlenderbotModel(TFBlenderbotPreTrainedModel):
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer): class BiasLayer(keras.layers.Layer):
""" """
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer. so all weights have to be registered in a layer.
""" """
......
...@@ -35,6 +35,7 @@ from ...modeling_tf_outputs import ( ...@@ -35,6 +35,7 @@ from ...modeling_tf_outputs import (
from ...modeling_tf_utils import ( from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFPreTrainedModel, TFPreTrainedModel,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): ...@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
# Copied from transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall # Copied from transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall
class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding): class TFBlenderbotSmallLearnedPositionalEmbedding(keras.layers.Embedding):
""" """
This module learns positional embeddings up to a fixed maximum size. This module learns positional embeddings up to a fixed maximum size.
""" """
...@@ -138,7 +139,7 @@ class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding): ...@@ -138,7 +139,7 @@ class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->BlenderbotSmall # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->BlenderbotSmall
class TFBlenderbotSmallAttention(tf.keras.layers.Layer): class TFBlenderbotSmallAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need""" """Multi-headed attention from "Attention Is All You Need"""
def __init__( def __init__(
...@@ -154,7 +155,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): ...@@ -154,7 +155,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim: if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError( raise ValueError(
...@@ -164,10 +165,10 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): ...@@ -164,10 +165,10 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
...@@ -309,20 +310,20 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): ...@@ -309,20 +310,20 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall # Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): class TFBlenderbotSmallEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotSmallConfig, **kwargs): def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
self.self_attn = TFBlenderbotSmallAttention( self.self_attn = TFBlenderbotSmallAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
) )
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -387,7 +388,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): ...@@ -387,7 +388,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall # Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): class TFBlenderbotSmallDecoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotSmallConfig, **kwargs): def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
...@@ -398,11 +399,11 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): ...@@ -398,11 +399,11 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
name="self_attn", name="self_attn",
is_decoder=True, is_decoder=True,
) )
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBlenderbotSmallAttention( self.encoder_attn = TFBlenderbotSmallAttention(
self.embed_dim, self.embed_dim,
config.decoder_attention_heads, config.decoder_attention_heads,
...@@ -410,10 +411,10 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): ...@@ -410,10 +411,10 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
name="encoder_attn", name="encoder_attn",
is_decoder=True, is_decoder=True,
) )
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -533,7 +534,7 @@ BLENDERBOT_SMALL_START_DOCSTRING = r""" ...@@ -533,7 +534,7 @@ BLENDERBOT_SMALL_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
...@@ -681,7 +682,7 @@ BLENDERBOT_SMALL_INPUTS_DOCSTRING = r""" ...@@ -681,7 +682,7 @@ BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
@keras_serializable @keras_serializable
class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): class TFBlenderbotSmallEncoder(keras.layers.Layer):
config_class = BlenderbotSmallConfig config_class = BlenderbotSmallConfig
""" """
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
...@@ -691,12 +692,10 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): ...@@ -691,12 +692,10 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
config: BlenderbotSmallConfig config: BlenderbotSmallConfig
""" """
def __init__( def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
self, config: BlenderbotSmallConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs
):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings self.max_source_positions = config.max_position_embeddings
...@@ -709,7 +708,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): ...@@ -709,7 +708,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
name="embed_positions", name="embed_positions",
) )
self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model self.embed_dim = config.d_model
def get_embed_tokens(self): def get_embed_tokens(self):
...@@ -855,7 +854,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): ...@@ -855,7 +854,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): class TFBlenderbotSmallDecoder(keras.layers.Layer):
config_class = BlenderbotSmallConfig config_class = BlenderbotSmallConfig
""" """
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`] Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`]
...@@ -865,9 +864,7 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): ...@@ -865,9 +864,7 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
embed_tokens: output embedding embed_tokens: output embedding
""" """
def __init__( def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
self, config: BlenderbotSmallConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs
):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
...@@ -880,9 +877,9 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): ...@@ -880,9 +877,9 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
) )
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self): def get_embed_tokens(self):
return self.embed_tokens return self.embed_tokens
...@@ -1095,17 +1092,17 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): ...@@ -1095,17 +1092,17 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer): class TFBlenderbotSmallMainLayer(keras.layers.Layer):
config_class = BlenderbotSmallConfig config_class = BlenderbotSmallConfig
def __init__(self, config: BlenderbotSmallConfig, **kwargs): def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.shared = tf.keras.layers.Embedding( self.shared = keras.layers.Embedding(
input_dim=config.vocab_size, input_dim=config.vocab_size,
output_dim=config.d_model, output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared", name="model.shared",
) )
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
...@@ -1314,9 +1311,9 @@ class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel): ...@@ -1314,9 +1311,9 @@ class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel):
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer): class BiasLayer(keras.layers.Layer):
""" """
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer. so all weights have to be registered in a layer.
""" """
......
...@@ -27,6 +27,7 @@ from ...modeling_tf_utils import ( ...@@ -27,6 +27,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
get_initializer, get_initializer,
get_tf_activation, get_tf_activation,
keras,
keras_serializable, keras_serializable,
shape_list, shape_list,
unpack_inputs, unpack_inputs,
...@@ -63,7 +64,7 @@ TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -63,7 +64,7 @@ TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss # Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
return tf.math.reduce_mean( return tf.math.reduce_mean(
tf.keras.metrics.sparse_categorical_crossentropy( keras.metrics.sparse_categorical_crossentropy(
y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
) )
) )
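`contrastive_loss` leans on the batch being aligned text/image pairs: row *i* of the similarity matrix should put its mass on column *i*, so the labels are simply `tf.range(batch_size)`. A self-contained check of that reading (random logits, illustrative only):

```py
import tensorflow as tf
from transformers.modeling_tf_utils import keras

batch_size = 4
# Similarity logits between 4 texts and 4 images; the diagonal holds the true pairs.
logits = tf.random.normal((batch_size, batch_size))

loss = tf.math.reduce_mean(
    keras.metrics.sparse_categorical_crossentropy(
        y_true=tf.range(batch_size), y_pred=logits, from_logits=True
    )
)
print(float(loss))
```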
...@@ -234,7 +235,7 @@ class TFBlipOutput(ModelOutput): ...@@ -234,7 +235,7 @@ class TFBlipOutput(ModelOutput):
) )
class TFBlipVisionEmbeddings(tf.keras.layers.Layer): class TFBlipVisionEmbeddings(keras.layers.Layer):
def __init__(self, config: BlipVisionConfig, **kwargs): def __init__(self, config: BlipVisionConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -242,7 +243,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer): ...@@ -242,7 +243,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
self.image_size = config.image_size self.image_size = config.image_size
self.patch_size = config.patch_size self.patch_size = config.patch_size
self.patch_embedding = tf.keras.layers.Conv2D( self.patch_embedding = keras.layers.Conv2D(
filters=self.embed_dim, filters=self.embed_dim,
kernel_size=self.patch_size, kernel_size=self.patch_size,
strides=self.patch_size, strides=self.patch_size,
...@@ -291,7 +292,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer): ...@@ -291,7 +292,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip
class TFBlipTextEmbeddings(tf.keras.layers.Layer): class TFBlipTextEmbeddings(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -349,7 +350,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): ...@@ -349,7 +350,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
return final_embeddings return final_embeddings
class TFBlipAttention(tf.keras.layers.Layer): class TFBlipAttention(keras.layers.Layer):
"""Multi-headed attention from 'Attention Is All You Need' paper""" """Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
...@@ -364,13 +365,13 @@ class TFBlipAttention(tf.keras.layers.Layer): ...@@ -364,13 +365,13 @@ class TFBlipAttention(tf.keras.layers.Layer):
f" {self.num_heads})." f" {self.num_heads})."
) )
self.scale = self.head_dim**-0.5 self.scale = self.head_dim**-0.5
self.dropout = tf.keras.layers.Dropout(config.attention_dropout, name="dropout") self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout")
self.qkv = tf.keras.layers.Dense( self.qkv = keras.layers.Dense(
3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv" 3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
) )
self.projection = tf.keras.layers.Dense( self.projection = keras.layers.Dense(
self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
) )
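The attention block above projects to a single fused `qkv` tensor of width `3 * embed_dim`; the part not shown in this hunk is splitting it back out per head. A shape-only sketch of one common way to do that split (dimensions arbitrary, not necessarily the file's exact code):

```py
import tensorflow as tf

bsz, seq_len, num_heads, head_dim = 2, 7, 12, 64
embed_dim = num_heads * head_dim

qkv_out = tf.random.normal((bsz, seq_len, 3 * embed_dim))       # output of the fused Dense
qkv = tf.reshape(qkv_out, (bsz, seq_len, 3, num_heads, head_dim))
qkv = tf.transpose(qkv, (2, 0, 3, 1, 4))                         # (3, batch, heads, seq, head_dim)
query, key, value = qkv[0], qkv[1], qkv[2]
assert query.shape == (bsz, num_heads, seq_len, head_dim)
```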
...@@ -433,7 +434,7 @@ class TFBlipAttention(tf.keras.layers.Layer): ...@@ -433,7 +434,7 @@ class TFBlipAttention(tf.keras.layers.Layer):
self.projection.build([None, None, self.embed_dim]) self.projection.build([None, None, self.embed_dim])
class TFBlipMLP(tf.keras.layers.Layer): class TFBlipMLP(keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs): def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -442,10 +443,10 @@ class TFBlipMLP(tf.keras.layers.Layer): ...@@ -442,10 +443,10 @@ class TFBlipMLP(tf.keras.layers.Layer):
in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5)
fc_std = (2 * config.hidden_size) ** -0.5 fc_std = (2 * config.hidden_size) ** -0.5
self.fc1 = tf.keras.layers.Dense( self.fc1 = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
) )
self.fc2 = tf.keras.layers.Dense( self.fc2 = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
) )
self.config = config self.config = config
...@@ -468,14 +469,14 @@ class TFBlipMLP(tf.keras.layers.Layer): ...@@ -468,14 +469,14 @@ class TFBlipMLP(tf.keras.layers.Layer):
self.fc2.build([None, None, self.config.intermediate_size]) self.fc2.build([None, None, self.config.intermediate_size])
class TFBlipEncoderLayer(tf.keras.layers.Layer): class TFBlipEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs): def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.hidden_size self.embed_dim = config.hidden_size
self.self_attn = TFBlipAttention(config, name="self_attn") self.self_attn = TFBlipAttention(config, name="self_attn")
self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
self.mlp = TFBlipMLP(config, name="mlp") self.mlp = TFBlipMLP(config, name="mlp")
self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
def call( def call(
self, self,
...@@ -551,7 +552,7 @@ BLIP_START_DOCSTRING = r""" ...@@ -551,7 +552,7 @@ BLIP_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior. behavior.
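Editor's note: because the class is a plain `keras.Model` subclass, the usual Keras call semantics apply once it is loaded. The sketch below is illustrative only; the checkpoint name, image size, and token ids are assumptions rather than values taken from this diff:

```py
import tensorflow as tf
from transformers import TFBlipModel

model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")  # assumed checkpoint
pixel_values = tf.random.uniform((1, 3, 384, 384))  # channels-first toy image batch (assumed size)
input_ids = tf.constant([[101, 1037, 4937, 102]])   # toy BERT-style token ids
outputs = model(input_ids=input_ids, pixel_values=pixel_values, training=False)
print(outputs.image_embeds.shape)  # called like any other Keras model
```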
...@@ -614,7 +615,7 @@ BLIP_INPUTS_DOCSTRING = r""" ...@@ -614,7 +615,7 @@ BLIP_INPUTS_DOCSTRING = r"""
@keras_serializable @keras_serializable
class TFBlipEncoder(tf.keras.layers.Layer): class TFBlipEncoder(keras.layers.Layer):
config_class = BlipConfig config_class = BlipConfig
""" """
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
...@@ -714,7 +715,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel): ...@@ -714,7 +715,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings") self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
self.encoder = TFBlipEncoder(config, name="encoder") self.encoder = TFBlipEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.embed_dim = config.hidden_size self.embed_dim = config.hidden_size
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
...@@ -798,7 +799,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel): ...@@ -798,7 +799,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
self.post_layernorm.build([None, None, self.embed_dim]) self.post_layernorm.build([None, None, self.embed_dim])
class TFBlipMainLayer(tf.keras.layers.Layer): class TFBlipMainLayer(keras.layers.Layer):
config_class = BlipConfig config_class = BlipConfig
def __init__(self, config: BlipConfig, *args, **kwargs): def __init__(self, config: BlipConfig, *args, **kwargs):
...@@ -826,13 +827,13 @@ class TFBlipMainLayer(tf.keras.layers.Layer): ...@@ -826,13 +827,13 @@ class TFBlipMainLayer(tf.keras.layers.Layer):
self.text_model = TFBlipTextModel(text_config, name="text_model") self.text_model = TFBlipTextModel(text_config, name="text_model")
self.vision_model = TFBlipVisionModel(vision_config, name="vision_model") self.vision_model = TFBlipVisionModel(vision_config, name="vision_model")
self.visual_projection = tf.keras.layers.Dense( self.visual_projection = keras.layers.Dense(
self.projection_dim, self.projection_dim,
use_bias=False, use_bias=False,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="visual_projection", name="visual_projection",
) )
self.text_projection = tf.keras.layers.Dense( self.text_projection = keras.layers.Dense(
self.projection_dim, self.projection_dim,
use_bias=False, use_bias=False,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
...@@ -845,7 +846,7 @@ class TFBlipMainLayer(tf.keras.layers.Layer): ...@@ -845,7 +846,7 @@ class TFBlipMainLayer(tf.keras.layers.Layer):
self.logit_scale = self.add_weight( self.logit_scale = self.add_weight(
name="logit_scale", name="logit_scale",
shape=[], shape=[],
initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True, trainable=True,
) )
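Editor's note: the `logit_scale` weight added above is the learned temperature applied to the projected image and text embeddings when forming similarity logits. A CLIP-style sketch under assumed shapes and an assumed init value (not the literal BLIP forward pass):

```py
import tensorflow as tf

image_embeds = tf.math.l2_normalize(tf.random.normal((4, 256)), axis=-1)
text_embeds = tf.math.l2_normalize(tf.random.normal((4, 256)), axis=-1)

logit_scale = tf.exp(tf.constant(2.6592))  # exp of an assumed logit_scale_init_value
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
logits_per_image = tf.transpose(logits_per_text)
```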
...@@ -1116,7 +1117,7 @@ class TFBlipForConditionalGeneration(TFBlipPreTrainedModel): ...@@ -1116,7 +1117,7 @@ class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
self.decoder_input_ids = config.text_config.bos_token_id self.decoder_input_ids = config.text_config.bos_token_id
self.decoder_pad_token_id = config.text_config.pad_token_id self.decoder_pad_token_id = config.text_config.pad_token_id
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding return self.vision_model.embeddings.patch_embedding
@unpack_inputs @unpack_inputs
...@@ -1307,7 +1308,7 @@ class TFBlipForQuestionAnswering(TFBlipPreTrainedModel): ...@@ -1307,7 +1308,7 @@ class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
self.decoder_pad_token_id = config.text_config.pad_token_id self.decoder_pad_token_id = config.text_config.pad_token_id
self.decoder_start_token_id = config.text_config.bos_token_id self.decoder_start_token_id = config.text_config.bos_token_id
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding return self.vision_model.embeddings.patch_embedding
# Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right # Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right
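Editor's note: the T5-style `_shift_right` referenced above prepends the decoder start token, drops the last position, and swaps any `-100` label padding for the pad token id. A hedged stand-alone sketch of that recipe (helper name and signature are illustrative):

```py
import tensorflow as tf

def shift_right(input_ids, decoder_start_token_id, pad_token_id):
    # Prepend the start token and drop the final position.
    start_tokens = tf.fill((tf.shape(input_ids)[0], 1), decoder_start_token_id)
    shifted = tf.concat([tf.cast(start_tokens, input_ids.dtype), input_ids[:, :-1]], axis=-1)
    # Labels use -100 for ignored positions; decoder inputs use the pad token instead.
    return tf.where(shifted == -100, tf.cast(pad_token_id, shifted.dtype), shifted)

# e.g. shift_right(tf.constant([[5, 6, -100]]), decoder_start_token_id=30522, pad_token_id=0)
```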
...@@ -1557,21 +1558,21 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): ...@@ -1557,21 +1558,21 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)
# vision projection layer # vision projection layer
self.vision_proj = tf.keras.layers.Dense( self.vision_proj = keras.layers.Dense(
config.image_text_hidden_size, config.image_text_hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="vision_proj", name="vision_proj",
) )
# text projection layer # text projection layer
self.text_proj = tf.keras.layers.Dense( self.text_proj = keras.layers.Dense(
config.image_text_hidden_size, config.image_text_hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="text_proj", name="text_proj",
) )
# image text matching head # image text matching head
self.itm_head = tf.keras.layers.Dense( self.itm_head = keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head" 2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head"
) )
...@@ -1587,7 +1588,7 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): ...@@ -1587,7 +1588,7 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
) )
self.config = config self.config = config
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding return self.vision_model.embeddings.patch_embedding
@unpack_inputs @unpack_inputs
......
...@@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( ...@@ -31,6 +31,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
get_initializer, get_initializer,
get_tf_activation, get_tf_activation,
keras,
keras_serializable, keras_serializable,
shape_list, shape_list,
unpack_inputs, unpack_inputs,
...@@ -75,18 +76,18 @@ BLIP_TEXT_INPUTS_DOCSTRING = r""" ...@@ -75,18 +76,18 @@ BLIP_TEXT_INPUTS_DOCSTRING = r"""
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52
class TFBlipTextEmbeddings(tf.keras.layers.Layer): class TFBlipTextEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word and position embeddings.""" """Construct the embeddings from word and position embeddings."""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.word_embeddings = tf.keras.layers.Embedding( self.word_embeddings = keras.layers.Embedding(
config.vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range), embeddings_initializer=get_initializer(config.initializer_range),
name="word_embeddings", name="word_embeddings",
) )
self.position_embeddings = tf.keras.layers.Embedding( self.position_embeddings = keras.layers.Embedding(
config.max_position_embeddings, config.max_position_embeddings,
config.hidden_size, config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range), embeddings_initializer=get_initializer(config.initializer_range),
...@@ -95,8 +96,8 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): ...@@ -95,8 +96,8 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
# self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load # self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load
# any TensorFlow checkpoint file # any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0) self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
...@@ -146,7 +147,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): ...@@ -146,7 +147,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
class TFBlipTextSelfAttention(tf.keras.layers.Layer): class TFBlipTextSelfAttention(keras.layers.Layer):
def __init__(self, config, is_cross_attention, **kwargs): def __init__(self, config, is_cross_attention, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -160,21 +161,21 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer): ...@@ -160,21 +161,21 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense( self.query = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
) )
self.key = tf.keras.layers.Dense( self.key = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
) )
self.value = tf.keras.layers.Dense( self.value = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
) )
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = tf.keras.layers.Embedding( self.distance_embedding = keras.layers.Embedding(
2 * config.max_position_embeddings - 1, self.attention_head_size 2 * config.max_position_embeddings - 1, self.attention_head_size
) )
self.is_cross_attention = is_cross_attention self.is_cross_attention = is_cross_attention
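Editor's note: for the `relative_key` / `relative_key_query` branch above, the distance embedding table has `2 * max_position_embeddings - 1` rows because pairwise distances range from `-(L - 1)` to `L - 1` and are shifted to be non-negative before the lookup. A small sketch of that indexing (BERT-style scheme, assumed to match):

```py
import tensorflow as tf

seq_len, max_position_embeddings = 5, 512
position_ids_l = tf.reshape(tf.range(seq_len, dtype=tf.int64), (-1, 1))
position_ids_r = tf.reshape(tf.range(seq_len, dtype=tf.int64), (1, -1))
distance = position_ids_l - position_ids_r         # values in [-(seq_len - 1), seq_len - 1]
indices = distance + max_position_embeddings - 1   # shifted into [0, 2 * max_position_embeddings - 2]
```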
...@@ -291,15 +292,15 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer): ...@@ -291,15 +292,15 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
self.value.build([None, None, self.config.hidden_size]) self.value.build([None, None, self.config.hidden_size])
class TFBlipTextSelfOutput(tf.keras.layers.Layer): class TFBlipTextSelfOutput(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
...@@ -322,7 +323,7 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer): ...@@ -322,7 +323,7 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
class TFBlipTextAttention(tf.keras.layers.Layer): class TFBlipTextAttention(keras.layers.Layer):
def __init__(self, config, is_cross_attention=False, **kwargs): def __init__(self, config, is_cross_attention=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self") self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self")
...@@ -367,11 +368,11 @@ class TFBlipTextAttention(tf.keras.layers.Layer): ...@@ -367,11 +368,11 @@ class TFBlipTextAttention(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
class TFBlipTextIntermediate(tf.keras.layers.Layer): class TFBlipTextIntermediate(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -396,15 +397,15 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer): ...@@ -396,15 +397,15 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size]) self.dense.build([None, None, self.config.hidden_size])
class TFBlipTextOutput(tf.keras.layers.Layer): class TFBlipTextOutput(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
...@@ -426,7 +427,7 @@ class TFBlipTextOutput(tf.keras.layers.Layer): ...@@ -426,7 +427,7 @@ class TFBlipTextOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLayer(tf.keras.layers.Layer): class TFBlipTextLayer(keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -504,7 +505,7 @@ class TFBlipTextLayer(tf.keras.layers.Layer): ...@@ -504,7 +505,7 @@ class TFBlipTextLayer(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386
@keras_serializable @keras_serializable
class TFBlipTextEncoder(tf.keras.layers.Layer): class TFBlipTextEncoder(keras.layers.Layer):
config_class = BlipTextConfig config_class = BlipTextConfig
def __init__(self, config, name=None, **kwargs): def __init__(self, config, name=None, **kwargs):
...@@ -593,11 +594,11 @@ class TFBlipTextEncoder(tf.keras.layers.Layer): ...@@ -593,11 +594,11 @@ class TFBlipTextEncoder(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
class TFBlipTextPooler(tf.keras.layers.Layer): class TFBlipTextPooler(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="tanh", activation="tanh",
...@@ -623,11 +624,11 @@ class TFBlipTextPooler(tf.keras.layers.Layer): ...@@ -623,11 +624,11 @@ class TFBlipTextPooler(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): class TFBlipTextPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
...@@ -638,7 +639,7 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -638,7 +639,7 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
else: else:
self.transform_act_fn = config.hidden_act self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
...@@ -660,14 +661,14 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -660,14 +661,14 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLMPredictionHead(tf.keras.layers.Layer): class TFBlipTextLMPredictionHead(keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.transform = TFBlipTextPredictionHeadTransform(config, name="transform") self.transform = TFBlipTextPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is # The output weights are the same as the input embeddings, but there is
# an output-only bias for each token. # an output-only bias for each token.
self.decoder = tf.keras.layers.Dense( self.decoder = keras.layers.Dense(
config.vocab_size, config.vocab_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="decoder", name="decoder",
...@@ -694,7 +695,7 @@ class TFBlipTextLMPredictionHead(tf.keras.layers.Layer): ...@@ -694,7 +695,7 @@ class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
return hidden_states return hidden_states
class TFBlipTextOnlyMLMHead(tf.keras.layers.Layer): class TFBlipTextOnlyMLMHead(keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.predictions = TFBlipTextLMPredictionHead(config, name="predictions") self.predictions = TFBlipTextLMPredictionHead(config, name="predictions")
...@@ -1062,7 +1063,7 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel): ...@@ -1062,7 +1063,7 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
# Keras won't give us label smoothing for sparse CE, so we de-sparsify things here # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
# Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway) # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32) one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
loss_fct = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none") loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32) masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores) lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
lm_loss *= masked_positions lm_loss *= masked_positions
......
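Editor's note: the loss block above de-sparsifies the labels because Keras' sparse categorical cross-entropy has no `label_smoothing` argument; padding positions (label `-100`) are clamped with `relu` before one-hot encoding and then masked out. A minimal stand-alone sketch with toy shapes, using the `keras` import this PR adds to `modeling_tf_utils`:

```py
import tensorflow as tf
from transformers.modeling_tf_utils import keras

vocab_size = 8
labels = tf.constant([[2, 5, -100]])            # -100 marks positions to ignore
logits = tf.random.normal((1, 3, vocab_size))   # stand-in for shifted_prediction_scores

one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=vocab_size, dtype=tf.float32)
loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)

lm_loss = loss_fct(one_hot_labels, logits) * masked_positions
mean_loss = tf.reduce_sum(lm_loss) / tf.maximum(tf.reduce_sum(masked_positions), 1.0)
```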