Unverified commit 415e9a09, authored by Matt and committed by GitHub

Add tf_keras imports to prepare for Keras 3 (#28588)

* Port core files + ESM (because ESM code is odd)

* Search-replace in modelling code

* Fix up transfo_xl as well

* Fix other core files + tests (still need to add correct import to tests)

* Fix cookiecutter

* make fixup, fix imports in some more core files

* Auto-add imports to tests

* Cleanup, add imports to sagemaker tests

* Use correct exception for importing tf_keras

* Fixes in modeling_tf_utils

* make fixup

* Correct version parsing code

* Ensure the pipeline tests correctly revert to float32 after each test

* Ensure the pipeline tests correctly revert to float32 after each test

* More tf.keras -> keras

* Add dtype cast

* Better imports of tf_keras (see the sketch below)

* Add a cast for tf.assign, just in case

* Fix callback imports
parent 1d489b3e
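Several of the commits above ("Use correct exception for importing tf_keras", "Better imports of tf_keras") revolve around one compatibility shim in modeling_tf_utils. A minimal sketch of the pattern, assuming `parse` from `packaging.version`; the exact wording and placement in the repo may differ:

from packaging.version import parse

try:
    import tf_keras as keras  # backwards-compatible Keras 2 package
except (ModuleNotFoundError, ImportError):
    import keras

    # If the bare `keras` import resolved to Keras 3, fail loudly here
    # instead of breaking in confusing ways deeper in the modeling code.
    if parse(keras.__version__).major > 2:
        raise ValueError(
            "Your currently installed version of Keras is Keras 3, but this is not yet "
            "supported in Transformers. Please install the backwards-compatible "
            "tf-keras package with `pip install tf-keras`."
        )

The modeling files then pull this `keras` symbol out of modeling_tf_utils, which is exactly the one-line `keras,` addition repeated in the import hunks below.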
@@ -46,6 +46,7 @@ from ...modeling_tf_utils import (
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
@@ -75,7 +76,7 @@ CAMEMBERT_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
@@ -168,7 +169,7 @@ CAMEMBERT_INPUTS_DOCSTRING = r"""
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings
class TFCamembertEmbeddings(tf.keras.layers.Layer):
class TFCamembertEmbeddings(keras.layers.Layer):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
@@ -181,8 +182,8 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
@@ -274,11 +275,11 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert
class TFCamembertPooler(tf.keras.layers.Layer):
class TFCamembertPooler(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
@@ -304,7 +305,7 @@ class TFCamembertPooler(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert
class TFCamembertSelfAttention(tf.keras.layers.Layer):
class TFCamembertSelfAttention(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
@@ -319,16 +320,16 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer):
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = tf.keras.layers.Dense(
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
@@ -437,15 +438,15 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert
class TFCamembertSelfOutput(tf.keras.layers.Layer):
class TFCamembertSelfOutput(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
@@ -468,7 +469,7 @@ class TFCamembertSelfOutput(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert
class TFCamembertAttention(tf.keras.layers.Layer):
class TFCamembertAttention(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
@@ -520,11 +521,11 @@ class TFCamembertAttention(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert
class TFCamembertIntermediate(tf.keras.layers.Layer):
class TFCamembertIntermediate(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
@@ -550,15 +551,15 @@ class TFCamembertIntermediate(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert
class TFCamembertOutput(tf.keras.layers.Layer):
class TFCamembertOutput(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
@@ -581,7 +582,7 @@ class TFCamembertOutput(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert
class TFCamembertLayer(tf.keras.layers.Layer):
class TFCamembertLayer(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
@@ -685,7 +686,7 @@ class TFCamembertLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert
class TFCamembertEncoder(tf.keras.layers.Layer):
class TFCamembertEncoder(keras.layers.Layer):
def __init__(self, config: CamembertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
@@ -765,7 +766,7 @@ class TFCamembertEncoder(tf.keras.layers.Layer):
@keras_serializable
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert
class TFCamembertMainLayer(tf.keras.layers.Layer):
class TFCamembertMainLayer(keras.layers.Layer):
config_class = CamembertConfig
def __init__(self, config, add_pooling_layer=True, **kwargs):
@@ -785,7 +786,7 @@ class TFCamembertMainLayer(tf.keras.layers.Layer):
self.embeddings = TFCamembertEmbeddings(config, name="embeddings")
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
@@ -1068,7 +1069,7 @@ class TFCamembertModel(TFCamembertPreTrainedModel):
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert
class TFCamembertLMHead(tf.keras.layers.Layer):
class TFCamembertLMHead(keras.layers.Layer):
"""Camembert Head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs):
@@ -1076,10 +1077,10 @@ class TFCamembertLMHead(tf.keras.layers.Layer):
self.config = config
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.act = get_tf_activation("gelu")
# The output weights are the same as the input embeddings, but there is
@@ -1222,12 +1223,12 @@ class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelin
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead
class TFCamembertClassificationHead(tf.keras.layers.Layer):
class TFCamembertClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
@@ -1236,8 +1237,8 @@ class TFCamembertClassificationHead(tf.keras.layers.Layer):
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(classifier_dropout)
self.out_proj = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(classifier_dropout)
self.out_proj = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
@@ -1371,8 +1372,8 @@ class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClass
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(classifier_dropout)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(classifier_dropout)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@@ -1463,8 +1464,8 @@ class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceL
super().__init__(config, *inputs, **kwargs)
self.roberta = TFCamembertMainLayer(config, name="roberta")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@@ -1568,7 +1569,7 @@ class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsw
self.num_labels = config.num_labels
self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
self.qa_outputs = tf.keras.layers.Dense(
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@@ -32,6 +32,7 @@ from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
@@ -77,7 +78,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
return tf.math.reduce_mean(
tf.keras.metrics.sparse_categorical_crossentropy(
keras.metrics.sparse_categorical_crossentropy(
y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
)
)
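In CLIP this per-direction loss is applied symmetrically, once over the similarity matrix and once over its transpose; a sketch of that combination, assuming the contrastive_loss above (illustrative, not part of this diff):

def clip_loss_sketch(similarity: tf.Tensor) -> tf.Tensor:
    # Average the caption->image and image->caption directions.
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(tf.transpose(similarity))
    return (caption_loss + image_loss) / 2.0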
@@ -127,7 +128,7 @@ class TFCLIPOutput(ModelOutput):
)
class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
class TFCLIPVisionEmbeddings(keras.layers.Layer):
def __init__(self, config: CLIPVisionConfig, **kwargs):
super().__init__(**kwargs)
@@ -140,7 +141,7 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
self.config = config
self.patch_embedding = tf.keras.layers.Conv2D(
self.patch_embedding = keras.layers.Conv2D(
filters=self.embed_dim,
kernel_size=self.patch_size,
strides=self.patch_size,
@@ -201,7 +202,7 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
return embeddings
class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
class TFCLIPTextEmbeddings(keras.layers.Layer):
def __init__(self, config: CLIPTextConfig, **kwargs):
super().__init__(**kwargs)
@@ -259,7 +260,7 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
return final_embeddings
class TFCLIPAttention(tf.keras.layers.Layer):
class TFCLIPAttention(keras.layers.Layer):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: CLIPConfig, **kwargs):
@@ -280,19 +281,19 @@ class TFCLIPAttention(tf.keras.layers.Layer):
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.q_proj = tf.keras.layers.Dense(
self.q_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
)
self.k_proj = tf.keras.layers.Dense(
self.k_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
)
self.v_proj = tf.keras.layers.Dense(
self.v_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_dropout)
self.dropout = keras.layers.Dropout(rate=config.attention_dropout)
self.out_proj = tf.keras.layers.Dense(
self.out_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
)
@@ -375,7 +376,7 @@ class TFCLIPAttention(tf.keras.layers.Layer):
self.out_proj.build([None, None, self.embed_dim])
class TFCLIPMLP(tf.keras.layers.Layer):
class TFCLIPMLP(keras.layers.Layer):
def __init__(self, config: CLIPConfig, **kwargs):
super().__init__(**kwargs)
@@ -385,10 +386,10 @@ class TFCLIPMLP(tf.keras.layers.Layer):
in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
fc_std = (2 * config.hidden_size) ** -0.5 * factor
self.fc1 = tf.keras.layers.Dense(
self.fc1 = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
)
self.fc2 = tf.keras.layers.Dense(
self.fc2 = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
)
self.config = config
@@ -411,15 +412,15 @@ class TFCLIPMLP(tf.keras.layers.Layer):
self.fc2.build([None, None, self.config.intermediate_size])
class TFCLIPEncoderLayer(tf.keras.layers.Layer):
class TFCLIPEncoderLayer(keras.layers.Layer):
def __init__(self, config: CLIPConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size
self.self_attn = TFCLIPAttention(config, name="self_attn")
self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
self.mlp = TFCLIPMLP(config, name="mlp")
self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
def call(
self,
@@ -480,7 +481,7 @@ class TFCLIPEncoderLayer(tf.keras.layers.Layer):
self.layer_norm2.build([None, None, self.embed_dim])
class TFCLIPEncoder(tf.keras.layers.Layer):
class TFCLIPEncoder(keras.layers.Layer):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`TFCLIPEncoderLayer`].
@@ -544,15 +545,13 @@ class TFCLIPEncoder(tf.keras.layers.Layer):
layer.build(None)
class TFCLIPTextTransformer(tf.keras.layers.Layer):
class TFCLIPTextTransformer(keras.layers.Layer):
def __init__(self, config: CLIPTextConfig, **kwargs):
super().__init__(**kwargs)
self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings")
self.encoder = TFCLIPEncoder(config, name="encoder")
self.final_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="final_layer_norm"
)
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
# For `pooled_output` computation
self.eos_token_id = config.eos_token_id
@@ -663,7 +662,7 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer):
@keras_serializable
class TFCLIPTextMainLayer(tf.keras.layers.Layer):
class TFCLIPTextMainLayer(keras.layers.Layer):
config_class = CLIPTextConfig
def __init__(self, config: CLIPTextConfig, **kwargs):
@@ -671,7 +670,7 @@ class TFCLIPTextMainLayer(tf.keras.layers.Layer):
self.config = config
self.text_model = TFCLIPTextTransformer(config, name="text_model")
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.text_model.embeddings
def set_input_embeddings(self, value: tf.Variable):
@@ -718,14 +717,14 @@ class TFCLIPTextMainLayer(tf.keras.layers.Layer):
self.text_model.build(None)
class TFCLIPVisionTransformer(tf.keras.layers.Layer):
class TFCLIPVisionTransformer(keras.layers.Layer):
def __init__(self, config: CLIPVisionConfig, **kwargs):
super().__init__(**kwargs)
self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings")
self.pre_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
self.pre_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
self.encoder = TFCLIPEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.embed_dim = config.hidden_size
def call(
@@ -782,7 +781,7 @@ class TFCLIPVisionTransformer(tf.keras.layers.Layer):
@keras_serializable
class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
class TFCLIPVisionMainLayer(keras.layers.Layer):
config_class = CLIPVisionConfig
def __init__(self, config: CLIPVisionConfig, **kwargs):
@@ -790,7 +789,7 @@ class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
self.config = config
self.vision_model = TFCLIPVisionTransformer(config, name="vision_model")
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings
@unpack_inputs
@@ -825,7 +824,7 @@ class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
@keras_serializable
class TFCLIPMainLayer(tf.keras.layers.Layer):
class TFCLIPMainLayer(keras.layers.Layer):
config_class = CLIPConfig
def __init__(self, config: CLIPConfig, **kwargs):
@@ -853,14 +852,14 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
self.text_model = TFCLIPTextTransformer(text_config, name="text_model")
self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model")
self.visual_projection = tf.keras.layers.Dense(
self.visual_projection = keras.layers.Dense(
units=self.projection_dim,
kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor),
use_bias=False,
name="visual_projection",
)
self.text_projection = tf.keras.layers.Dense(
self.text_projection = keras.layers.Dense(
units=self.projection_dim,
kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor),
use_bias=False,
@@ -872,7 +871,7 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
def build(self, input_shape: tf.TensorShape = None):
self.logit_scale = self.add_weight(
shape=(1,),
initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True,
name="logit_scale",
)
@@ -1046,7 +1045,7 @@ CLIP_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
@@ -41,6 +41,7 @@ from ...modeling_tf_utils import (
TFSequenceSummary,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
@@ -68,7 +69,7 @@ TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->ConvBert
class TFConvBertEmbeddings(tf.keras.layers.Layer):
class TFConvBertEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config: ConvBertConfig, **kwargs):
@@ -78,8 +79,8 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
self.embedding_size = config.embedding_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
@@ -152,7 +153,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
return final_embeddings
class TFConvBertSelfAttention(tf.keras.layers.Layer):
class TFConvBertSelfAttention(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -178,17 +179,17 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
self.attention_head_size = config.hidden_size // config.num_attention_heads
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(
self.query = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.key = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.value = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.key_conv_attn_layer = tf.keras.layers.SeparableConv1D(
self.key_conv_attn_layer = keras.layers.SeparableConv1D(
self.all_head_size,
self.conv_kernel_size,
padding="same",
@@ -198,21 +199,21 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
name="key_conv_attn_layer",
)
self.conv_kernel_layer = tf.keras.layers.Dense(
self.conv_kernel_layer = keras.layers.Dense(
self.num_attention_heads * self.conv_kernel_size,
activation=None,
name="conv_kernel_layer",
kernel_initializer=get_initializer(config.initializer_range),
)
self.conv_out_layer = tf.keras.layers.Dense(
self.conv_out_layer = keras.layers.Dense(
self.all_head_size,
activation=None,
name="conv_out_layer",
kernel_initializer=get_initializer(config.initializer_range),
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, x, batch_size):
@@ -327,15 +328,15 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
self.conv_out_layer.build([None, None, self.config.hidden_size])
class TFConvBertSelfOutput(tf.keras.layers.Layer):
class TFConvBertSelfOutput(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states, input_tensor, training=False):
@@ -357,7 +358,7 @@ class TFConvBertSelfOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFConvBertAttention(tf.keras.layers.Layer):
class TFConvBertAttention(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -388,7 +389,7 @@ class TFConvBertAttention(tf.keras.layers.Layer):
self.dense_output.build(None)
class GroupedLinearLayer(tf.keras.layers.Layer):
class GroupedLinearLayer(keras.layers.Layer):
def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs):
super().__init__(**kwargs)
self.input_size = input_size
@@ -421,11 +422,11 @@ class GroupedLinearLayer(tf.keras.layers.Layer):
return x
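GroupedLinearLayer, whose call ends here, splits the feature dimension into num_groups slices and gives each slice its own small weight matrix, cutting parameters by a factor of num_groups versus a full Dense layer. A simplified einsum sketch of the idea, not the repo's exact implementation:

def grouped_linear_sketch(x, kernels, num_groups):
    # x: (batch, seq, in_size); kernels: (num_groups, in_size // num_groups, out_per_group)
    b, s, d = tf.unstack(tf.shape(x))
    xg = tf.reshape(x, [b, s, num_groups, d // num_groups])
    yg = tf.einsum("bsgi,gio->bsgo", xg, kernels)  # one small matmul per group
    return tf.reshape(yg, [b, s, -1])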
class TFConvBertIntermediate(tf.keras.layers.Layer):
class TFConvBertIntermediate(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
if config.num_groups == 1:
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
else:
@@ -458,12 +459,12 @@ class TFConvBertIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size])
class TFConvBertOutput(tf.keras.layers.Layer):
class TFConvBertOutput(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
if config.num_groups == 1:
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
else:
@@ -474,8 +475,8 @@ class TFConvBertOutput(tf.keras.layers.Layer):
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states, input_tensor, training=False):
@@ -497,7 +498,7 @@ class TFConvBertOutput(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.intermediate_size])
class TFConvBertLayer(tf.keras.layers.Layer):
class TFConvBertLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -531,7 +532,7 @@ class TFConvBertLayer(tf.keras.layers.Layer):
self.bert_output.build(None)
class TFConvBertEncoder(tf.keras.layers.Layer):
class TFConvBertEncoder(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
@@ -583,11 +584,11 @@ class TFConvBertEncoder(tf.keras.layers.Layer):
layer.build(None)
class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
class TFConvBertPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
@@ -596,7 +597,7 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states):
@@ -619,7 +620,7 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
@keras_serializable
class TFConvBertMainLayer(tf.keras.layers.Layer):
class TFConvBertMainLayer(keras.layers.Layer):
config_class = ConvBertConfig
def __init__(self, config, **kwargs):
@@ -628,7 +629,7 @@ class TFConvBertMainLayer(tf.keras.layers.Layer):
self.embeddings = TFConvBertEmbeddings(config, name="embeddings")
if config.embedding_size != config.hidden_size:
self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project")
self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project")
self.encoder = TFConvBertEncoder(config, name="encoder")
self.config = config
@@ -755,7 +756,7 @@ CONVBERT_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
@@ -901,7 +902,7 @@ class TFConvBertModel(TFConvBertPreTrainedModel):
self.convbert.build(None)
class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
class TFConvBertMaskedLMHead(keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
@@ -938,12 +939,12 @@ class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
return hidden_states
class TFConvBertGeneratorPredictions(tf.keras.layers.Layer):
class TFConvBertGeneratorPredictions(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dense = keras.layers.Dense(config.embedding_size, name="dense")
self.config = config
def call(self, generator_hidden_states, training=False):
@@ -1058,20 +1059,20 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL
self.generator_lm_head.build(None)
class TFConvBertClassificationHead(tf.keras.layers.Layer):
class TFConvBertClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(classifier_dropout)
self.out_proj = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(classifier_dropout)
self.out_proj = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
@@ -1193,7 +1194,7 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLos
self.sequence_summary = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="sequence_summary"
)
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@@ -1302,8 +1303,8 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassif
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(classifier_dropout)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(classifier_dropout)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@@ -1386,7 +1387,7 @@ class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnswer
self.num_labels = config.num_labels
self.convbert = TFConvBertMainLayer(config, name="convbert")
self.qa_outputs = tf.keras.layers.Dense(
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@@ -29,6 +29,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
@@ -44,7 +45,7 @@ _CONFIG_FOR_DOC = "ConvNextConfig"
_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
class TFConvNextDropPath(tf.keras.layers.Layer):
class TFConvNextDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
@@ -64,22 +65,22 @@ class TFConvNextDropPath(tf.keras.layers.Layer):
return x
class TFConvNextEmbeddings(tf.keras.layers.Layer):
class TFConvNextEmbeddings(keras.layers.Layer):
"""This class is comparable to (and inspired by) the SwinEmbeddings class
found in src/transformers/models/swin/modeling_swin.py.
"""
def __init__(self, config: ConvNextConfig, **kwargs):
super().__init__(**kwargs)
self.patch_embeddings = tf.keras.layers.Conv2D(
self.patch_embeddings = keras.layers.Conv2D(
filters=config.hidden_sizes[0],
kernel_size=config.patch_size,
strides=config.patch_size,
name="patch_embeddings",
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(),
bias_initializer=keras.initializers.Zeros(),
)
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels
self.config = config
@@ -93,7 +94,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
)
# When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
# When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
# So change the input format from `NCHW` to `NHWC`.
# shape = (batch_size, in_height, in_width, in_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
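The comment above is the reason for this transpose; an illustration with toy shapes, not part of the diff:

nchw = tf.zeros((8, 3, 224, 224))                   # (batch, channels, height, width)
nhwc = tf.transpose(nchw, perm=(0, 2, 3, 1))        # (batch, height, width, channels)
nchw_again = tf.transpose(nhwc, perm=(0, 3, 1, 2))  # converted back at the model boundary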
@@ -114,7 +115,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextLayer(tf.keras.layers.Layer):
class TFConvNextLayer(keras.layers.Layer):
"""This corresponds to the `Block` class in the original implementation.
There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
@@ -133,7 +134,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.dim = dim
self.config = config
self.dwconv = tf.keras.layers.Conv2D(
self.dwconv = keras.layers.Conv2D(
filters=dim,
kernel_size=7,
padding="same",
@@ -142,18 +143,18 @@ class TFConvNextLayer(tf.keras.layers.Layer):
bias_initializer="zeros",
name="dwconv",
) # depthwise conv
self.layernorm = tf.keras.layers.LayerNormalization(
self.layernorm = keras.layers.LayerNormalization(
epsilon=1e-6,
name="layernorm",
)
self.pwconv1 = tf.keras.layers.Dense(
self.pwconv1 = keras.layers.Dense(
units=4 * dim,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros",
name="pwconv1",
) # pointwise/1x1 convs, implemented with linear layers
self.act = get_tf_activation(config.hidden_act)
self.pwconv2 = tf.keras.layers.Dense(
self.pwconv2 = keras.layers.Dense(
units=dim,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros",
@@ -164,7 +165,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
self.drop_path = (
TFConvNextDropPath(drop_path, name="drop_path")
if drop_path > 0.0
else tf.keras.layers.Activation("linear", name="drop_path")
else keras.layers.Activation("linear", name="drop_path")
)
def build(self, input_shape: tf.TensorShape = None):
@@ -172,7 +173,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
self.layer_scale_parameter = (
self.add_weight(
shape=(self.dim,),
initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
trainable=True,
name="layer_scale_parameter",
)
@@ -214,7 +215,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
return x
class TFConvNextStage(tf.keras.layers.Layer):
class TFConvNextStage(keras.layers.Layer):
"""ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks.
Args:
@@ -244,7 +245,7 @@ class TFConvNextStage(tf.keras.layers.Layer):
super().__init__(**kwargs)
if in_channels != out_channels or stride > 1:
self.downsampling_layer = [
tf.keras.layers.LayerNormalization(
keras.layers.LayerNormalization(
epsilon=1e-6,
name="downsampling_layer.0",
),
@@ -253,12 +254,12 @@ class TFConvNextStage(tf.keras.layers.Layer):
# layer. All the outputs throughout the model will be in NHWC
# from this point on until the output where we again change to
# NCHW.
tf.keras.layers.Conv2D(
keras.layers.Conv2D(
filters=out_channels,
kernel_size=kernel_size,
strides=stride,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(),
bias_initializer=keras.initializers.Zeros(),
name="downsampling_layer.1",
),
]
@@ -301,7 +302,7 @@ class TFConvNextStage(tf.keras.layers.Layer):
self.downsampling_layer[1].build([None, None, None, self.in_channels])
class TFConvNextEncoder(tf.keras.layers.Layer):
class TFConvNextEncoder(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.stages = []
@@ -347,7 +348,7 @@ class TFConvNextEncoder(tf.keras.layers.Layer):
@keras_serializable
class TFConvNextMainLayer(tf.keras.layers.Layer):
class TFConvNextMainLayer(keras.layers.Layer):
config_class = ConvNextConfig
def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs):
@@ -356,10 +357,10 @@ class TFConvNextMainLayer(tf.keras.layers.Layer):
self.config = config
self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
self.encoder = TFConvNextEncoder(config, name="encoder")
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# We are setting the `data_format` like so because from here on we will revert to the
# NCHW output format
self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None
self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None
@unpack_inputs
def call(
@@ -436,7 +437,7 @@ CONVNEXT_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
@@ -575,7 +576,7 @@ class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClas
self.convnext = TFConvNextMainLayer(config, name="convnext")
# Classifier head
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros",
@@ -34,6 +34,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
@@ -67,7 +68,7 @@ CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->ConvNextV2
class TFConvNextV2DropPath(tf.keras.layers.Layer):
class TFConvNextV2DropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
@@ -87,7 +88,7 @@ class TFConvNextV2DropPath(tf.keras.layers.Layer):
return x
class TFConvNextV2GRN(tf.keras.layers.Layer):
class TFConvNextV2GRN(keras.layers.Layer):
"""GRN (Global Response Normalization) layer"""
def __init__(self, config: ConvNextV2Config, dim: int, **kwargs):
@@ -99,12 +100,12 @@ class TFConvNextV2GRN(tf.keras.layers.Layer):
self.weight = self.add_weight(
name="weight",
shape=(1, 1, 1, self.dim),
initializer=tf.keras.initializers.Zeros(),
initializer=keras.initializers.Zeros(),
)
self.bias = self.add_weight(
name="bias",
shape=(1, 1, 1, self.dim),
initializer=tf.keras.initializers.Zeros(),
initializer=keras.initializers.Zeros(),
)
return super().build(input_shape)
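The weight and bias registered in this build implement global response normalization, roughly out = weight * (x * N(G(x))) + bias + x, where G takes spatial L2 norms and N divides them by their channel mean. A sketch of the forward pass under those assumptions; the layer's actual call in the repo may differ in details such as epsilon handling:

def grn_forward_sketch(hidden_states, weight, bias):
    # hidden_states is NHWC; aggregate an L2 norm over the spatial axes,
    global_features = tf.norm(hidden_states, ord="euclidean", axis=(1, 2), keepdims=True)
    # normalize by the mean over channels, then gate the input and add a residual.
    norm_features = global_features / (tf.reduce_mean(global_features, axis=-1, keepdims=True) + 1e-6)
    return weight * (hidden_states * norm_features) + bias + hidden_states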
@@ -116,22 +117,22 @@
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextEmbeddings with ConvNext->ConvNextV2
class TFConvNextV2Embeddings(tf.keras.layers.Layer):
class TFConvNextV2Embeddings(keras.layers.Layer):
"""This class is comparable to (and inspired by) the SwinEmbeddings class
found in src/transformers/models/swin/modeling_swin.py.
"""
def __init__(self, config: ConvNextV2Config, **kwargs):
super().__init__(**kwargs)
self.patch_embeddings = tf.keras.layers.Conv2D(
self.patch_embeddings = keras.layers.Conv2D(
filters=config.hidden_sizes[0],
kernel_size=config.patch_size,
strides=config.patch_size,
name="patch_embeddings",
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(),
bias_initializer=keras.initializers.Zeros(),
)
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels
self.config = config
@@ -145,7 +146,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer):
message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
)
# When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
# When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
# So change the input format from `NCHW` to `NHWC`.
# shape = (batch_size, in_height, in_width, in_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
@@ -166,7 +167,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextV2Layer(tf.keras.layers.Layer):
class TFConvNextV2Layer(keras.layers.Layer):
"""This corresponds to the `Block` class in the original implementation.
There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
@@ -188,31 +189,31 @@ class TFConvNextV2Layer(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.dim = dim
self.config = config
self.dwconv = tf.keras.layers.Conv2D(
self.dwconv = keras.layers.Conv2D(
filters=dim,
kernel_size=7,
padding="same",
groups=dim,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(),
bias_initializer=keras.initializers.Zeros(),
name="dwconv",
) # depthwise conv
self.layernorm = tf.keras.layers.LayerNormalization(
self.layernorm = keras.layers.LayerNormalization(
epsilon=1e-6,
name="layernorm",
)
self.pwconv1 = tf.keras.layers.Dense(
self.pwconv1 = keras.layers.Dense(
units=4 * dim,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(),
bias_initializer=keras.initializers.Zeros(),
name="pwconv1",
) # pointwise/1x1 convs, implemented with linear layers
self.act = get_tf_activation(config.hidden_act)
self.grn = TFConvNextV2GRN(config, 4 * dim, dtype=tf.float32, name="grn")
self.pwconv2 = tf.keras.layers.Dense(
self.pwconv2 = keras.layers.Dense(
units=dim,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(),
bias_initializer=keras.initializers.Zeros(),
name="pwconv2",
)
# Using `layers.Activation` instead of `tf.identity` to better control `training`
@@ -220,7 +221,7 @@ class TFConvNextV2Layer(tf.keras.layers.Layer):
self.drop_path = (
TFConvNextV2DropPath(drop_path, name="drop_path")
if drop_path > 0.0
else tf.keras.layers.Activation("linear", name="drop_path")
else keras.layers.Activation("linear", name="drop_path")
)
def call(self, hidden_states, training=False):
@@ -260,7 +261,7 @@ class TFConvNextV2Layer(tf.keras.layers.Layer):
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2
class TFConvNextV2Stage(tf.keras.layers.Layer):
class TFConvNextV2Stage(keras.layers.Layer):
"""ConvNextV2 stage, consisting of an optional downsampling layer + multiple residual blocks.
Args:
@@ -290,7 +291,7 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
super().__init__(**kwargs)
if in_channels != out_channels or stride > 1:
self.downsampling_layer = [
tf.keras.layers.LayerNormalization(
keras.layers.LayerNormalization(
epsilon=1e-6,
name="downsampling_layer.0",
),
@@ -299,12 +300,12 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
# layer. All the outputs throughout the model will be in NHWC
# from this point on until the output where we again change to
# NCHW.
tf.keras.layers.Conv2D(
keras.layers.Conv2D(
filters=out_channels,
kernel_size=kernel_size,
strides=stride,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(),
bias_initializer=keras.initializers.Zeros(),
name="downsampling_layer.1",
),
]
@@ -347,7 +348,7 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
self.downsampling_layer[1].build([None, None, None, self.in_channels])
class TFConvNextV2Encoder(tf.keras.layers.Layer):
class TFConvNextV2Encoder(keras.layers.Layer):
def __init__(self, config: ConvNextV2Config, **kwargs):
super().__init__(**kwargs)
self.stages = []
@@ -398,7 +399,7 @@ class TFConvNextV2Encoder(tf.keras.layers.Layer):
@keras_serializable
class TFConvNextV2MainLayer(tf.keras.layers.Layer):
class TFConvNextV2MainLayer(keras.layers.Layer):
config_class = ConvNextV2Config
def __init__(self, config: ConvNextV2Config, **kwargs):
@@ -407,10 +408,10 @@ class TFConvNextV2MainLayer(tf.keras.layers.Layer):
self.config = config
self.embeddings = TFConvNextV2Embeddings(config, name="embeddings")
self.encoder = TFConvNextV2Encoder(config, name="encoder")
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# We are setting the `data_format` like so because from here on we will revert to the
# NCHW output format
self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_last")
self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_last")
@unpack_inputs
def call(
@@ -489,7 +490,7 @@ CONVNEXTV2_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
@@ -614,10 +615,10 @@ class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequence
self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2")
# Classifier head
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(),
bias_initializer=keras.initializers.Zeros(),
name="classifier",
)
@@ -29,6 +29,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
@@ -90,7 +91,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
return output, attention_weights
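scaled_dot_product_attention, whose signature appears in the hunk header above, is the textbook softmax(QK^T / sqrt(d_k))V computation; a minimal sketch without the extra attention_mask/head_mask plumbing:

def sdpa_sketch(q, k, v, mask=None):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], matmul_qk.dtype)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_attention_logits += mask * -1e4  # drive masked positions to ~0 weight
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    return tf.matmul(attention_weights, v), attention_weights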
class TFMultiHeadAttention(tf.keras.layers.Layer):
class TFMultiHeadAttention(keras.layers.Layer):
def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
super().__init__(**kwargs)
self.num_heads = num_heads
@@ -99,11 +100,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
self.depth = int(d_model_size / self.num_heads)
self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq")
self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk")
self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv")
self.Wq = keras.layers.Dense(d_model_size, name="Wq")
self.Wk = keras.layers.Dense(d_model_size, name="Wk")
self.Wv = keras.layers.Dense(d_model_size, name="Wv")
self.dense = tf.keras.layers.Dense(d_model_size, name="dense")
self.dense = keras.layers.Dense(d_model_size, name="dense")
def split_into_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
@@ -160,12 +161,12 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
self.dense.build([None, None, self.d_model_size])
class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
class TFPointWiseFeedForwardLayer(keras.layers.Layer):
def __init__(self, d_model_size, dff, **kwargs):
super().__init__(**kwargs)
self.dense_0 = tf.keras.layers.Dense(dff, activation="relu", name="0")
self.dense_2 = tf.keras.layers.Dense(d_model_size, name="2")
self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0")
self.dense_2 = keras.layers.Dense(d_model_size, name="2")
self.d_model_size = d_model_size
self.dff = dff
@@ -187,7 +188,7 @@ class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
self.dense_2.build([None, None, self.dff])
class TFEncoderLayer(tf.keras.layers.Layer):
class TFEncoderLayer(keras.layers.Layer):
def __init__(
self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
):
@@ -200,11 +201,11 @@ class TFEncoderLayer(tf.keras.layers.Layer):
)
self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn")
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")
self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
self.dropout1 = keras.layers.Dropout(rate)
self.dropout2 = keras.layers.Dropout(rate)
self.d_model_size = d_model_size
def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
@@ -252,7 +253,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
@keras_serializable
class TFCTRLMainLayer(tf.keras.layers.Layer):
class TFCTRLMainLayer(keras.layers.Layer):
config_class = CTRLConfig
def __init__(self, config, **kwargs):
@@ -269,14 +270,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
self.w = tf.keras.layers.Embedding(
self.w = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name="w",
)
self.dropout = tf.keras.layers.Dropout(config.embd_pdrop)
self.dropout = keras.layers.Dropout(config.embd_pdrop)
self.h = [
TFEncoderLayer(
config.n_embd,
......@@ -289,7 +290,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
)
for i in range(config.n_layer)
]
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
def get_input_embeddings(self):
return self.w
@@ -476,7 +477,7 @@ CTRL_START_DOCSTRING = r"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
@@ -635,9 +636,9 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
self.transformer.build(None)
class TFCTRLBiasLayer(tf.keras.layers.Layer):
class TFCTRLBiasLayer(keras.layers.Layer):
"""
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis,
Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer.
"""
@@ -812,7 +813,7 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
@@ -29,6 +29,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
@@ -80,7 +81,7 @@ class TFBaseModelOutputWithCLSToken(ModelOutput):
hidden_states: Tuple[tf.Tensor, ...] | None = None
class TFCvtDropPath(tf.keras.layers.Layer):
class TFCvtDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
@@ -100,7 +101,7 @@ class TFCvtDropPath(tf.keras.layers.Layer):
return (x / keep_prob) * random_tensor
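The return statement above is the tail of stochastic depth; the full computation it belongs to, sketched self-contained (shapes assumed, not verbatim from the repo):

def drop_path_sketch(x, drop_prob, training):
    # Randomly zero whole residual branches per sample; rescale survivors by
    # 1/keep_prob so the expected value is unchanged.
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    shape = (tf.shape(x)[0],) + (1,) * (len(x.shape) - 1)
    random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
    random_tensor = tf.floor(random_tensor)  # per-sample 0/1 keep mask
    return (x / keep_prob) * random_tensor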
class TFCvtEmbeddings(tf.keras.layers.Layer):
class TFCvtEmbeddings(keras.layers.Layer):
"""Construct the Convolutional Token Embeddings."""
def __init__(
@@ -124,7 +125,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
padding=padding,
name="convolution_embeddings",
)
self.dropout = tf.keras.layers.Dropout(dropout_rate)
self.dropout = keras.layers.Dropout(dropout_rate)
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution_embeddings(pixel_values)
@@ -140,7 +141,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
self.convolution_embeddings.build(None)
class TFCvtConvEmbeddings(tf.keras.layers.Layer):
class TFCvtConvEmbeddings(keras.layers.Layer):
"""Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
def __init__(
......@@ -154,9 +155,9 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
**kwargs,
):
super().__init__(**kwargs)
self.padding = tf.keras.layers.ZeroPadding2D(padding=padding)
self.padding = keras.layers.ZeroPadding2D(padding=padding)
self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
self.projection = tf.keras.layers.Conv2D(
self.projection = keras.layers.Conv2D(
filters=embed_dim,
kernel_size=patch_size,
strides=stride,
......@@ -166,7 +167,7 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
name="projection",
)
# Using the same default epsilon as PyTorch
self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
self.num_channels = num_channels
self.embed_dim = embed_dim
......@@ -198,13 +199,13 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
self.normalization.build([None, None, self.embed_dim])
class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
class TFCvtSelfAttentionConvProjection(keras.layers.Layer):
"""Convolutional projection layer."""
def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
super().__init__(**kwargs)
self.padding = tf.keras.layers.ZeroPadding2D(padding=padding)
self.convolution = tf.keras.layers.Conv2D(
self.padding = keras.layers.ZeroPadding2D(padding=padding)
self.convolution = keras.layers.Conv2D(
filters=embed_dim,
kernel_size=kernel_size,
kernel_initializer=get_initializer(config.initializer_range),
......@@ -215,7 +216,7 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
groups=embed_dim,
)
# Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum)
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
......@@ -235,7 +236,7 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
self.normalization.build([None, None, None, self.embed_dim])
class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer):
class TFCvtSelfAttentionLinearProjection(keras.layers.Layer):
"""Linear projection layer used to flatten tokens into 1D."""
def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
......@@ -246,7 +247,7 @@ class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer):
return hidden_state
class TFCvtSelfAttentionProjection(tf.keras.layers.Layer):
class TFCvtSelfAttentionProjection(keras.layers.Layer):
"""Convolutional Projection for Attention."""
def __init__(
......@@ -280,7 +281,7 @@ class TFCvtSelfAttentionProjection(tf.keras.layers.Layer):
self.convolution_projection.build(None)
class TFCvtSelfAttention(tf.keras.layers.Layer):
class TFCvtSelfAttention(keras.layers.Layer):
"""
Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection) is applied for
query, key, and value embeddings.
......@@ -336,28 +337,28 @@ class TFCvtSelfAttention(tf.keras.layers.Layer):
name="convolution_projection_value",
)
self.projection_query = tf.keras.layers.Dense(
self.projection_query = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_query",
)
self.projection_key = tf.keras.layers.Dense(
self.projection_key = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_key",
)
self.projection_value = tf.keras.layers.Dense(
self.projection_value = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_value",
)
self.dropout = tf.keras.layers.Dropout(attention_drop_rate)
self.dropout = keras.layers.Dropout(attention_drop_rate)
def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
batch_size, hidden_size, _ = shape_list(hidden_state)
......@@ -424,15 +425,15 @@ class TFCvtSelfAttention(tf.keras.layers.Layer):
self.projection_value.build([None, None, self.embed_dim])
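The `rearrange_for_multi_head_attention` method seen at the top of this hunk reshapes the flat hidden dimension into per-head slices before the attention scores are computed. A sketch of that conventional split, assuming a `(batch, seq, num_heads * head_size)` input (hypothetical standalone function; the layer also applies the inverse transpose after attention):

```python
import tensorflow as tf

def split_heads(hidden_state: tf.Tensor, num_heads: int, head_size: int) -> tf.Tensor:
    """(batch, seq, num_heads * head_size) -> (batch, num_heads, seq, head_size)."""
    batch_size = tf.shape(hidden_state)[0]
    x = tf.reshape(hidden_state, (batch_size, -1, num_heads, head_size))
    return tf.transpose(x, perm=(0, 2, 1, 3))  # bring the head axis forward
```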
class TFCvtSelfOutput(tf.keras.layers.Layer):
class TFCvtSelfOutput(keras.layers.Layer):
"""Output of the Attention layer ."""
def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(drop_rate)
self.dropout = keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
......@@ -449,7 +450,7 @@ class TFCvtSelfOutput(tf.keras.layers.Layer):
self.dense.build([None, None, self.embed_dim])
class TFCvtAttention(tf.keras.layers.Layer):
class TFCvtAttention(keras.layers.Layer):
"""Attention layer. First chunk of the convolutional transformer block."""
def __init__(
......@@ -507,12 +508,12 @@ class TFCvtAttention(tf.keras.layers.Layer):
self.dense_output.build(None)
class TFCvtIntermediate(tf.keras.layers.Layer):
class TFCvtIntermediate(keras.layers.Layer):
"""Intermediate dense layer. Second chunk of the convolutional transformer block."""
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=int(embed_dim * mlp_ratio),
kernel_initializer=get_initializer(config.initializer_range),
activation="gelu",
......@@ -533,17 +534,17 @@ class TFCvtIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.embed_dim])
class TFCvtOutput(tf.keras.layers.Layer):
class TFCvtOutput(keras.layers.Layer):
"""
Output of the Convolutional Transformer Block (last chunk). It consists of an MLP and a residual connection.
"""
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(drop_rate)
self.dropout = keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim
self.mlp_ratio = mlp_ratio
......@@ -562,7 +563,7 @@ class TFCvtOutput(tf.keras.layers.Layer):
self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])
class TFCvtLayer(tf.keras.layers.Layer):
class TFCvtLayer(keras.layers.Layer):
"""
Convolutional Transformer Block composed of attention layers, normalization and multi-layer perceptrons (MLPs). It
consists of 3 chunks: an attention layer, an intermediate dense layer and an output layer. This corresponds to the
......@@ -611,11 +612,11 @@ class TFCvtLayer(tf.keras.layers.Layer):
self.drop_path = (
TFCvtDropPath(drop_path_rate, name="drop_path")
if drop_path_rate > 0.0
else tf.keras.layers.Activation("linear", name="drop_path")
else keras.layers.Activation("linear", name="drop_path")
)
# Using the same default epsilon as PyTorch
self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
......@@ -659,7 +660,7 @@ class TFCvtLayer(tf.keras.layers.Layer):
self.layernorm_after.build([None, None, self.embed_dim])
class TFCvtStage(tf.keras.layers.Layer):
class TFCvtStage(keras.layers.Layer):
"""
Cvt stage (encoder block). Each stage has 2 parts:
- (1) A Convolutional Token Embedding layer
......@@ -755,7 +756,7 @@ class TFCvtStage(tf.keras.layers.Layer):
layer.build(None)
class TFCvtEncoder(tf.keras.layers.Layer):
class TFCvtEncoder(keras.layers.Layer):
"""
Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
(depth) being 1, 2 and 10.
......@@ -782,7 +783,7 @@ class TFCvtEncoder(tf.keras.layers.Layer):
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values
# When running on CPU, `tf.keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width)
# When running on CPU, `keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width)
# as input format. So change the input format to (batch_size, height, width, num_channels).
hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))
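The transpose above works around a backend limitation rather than a modeling choice: on CPU, `Conv2D` only accepts channels-last input, so the PyTorch-style `NCHW` pixel values are converted to `NHWC` once at the top of the encoder. A round-trip sketch of the convention (illustrative shapes, not the encoder's actual layer stack):

```python
import tensorflow as tf
from tensorflow import keras  # stand-in for the `keras` shim used in this PR

pixel_values = tf.random.normal((2, 3, 224, 224))      # NCHW, PyTorch-style
x = tf.transpose(pixel_values, perm=(0, 2, 3, 1))      # -> NHWC, required by Conv2D on CPU
x = keras.layers.Conv2D(filters=64, kernel_size=7, strides=4)(x)
x = tf.transpose(x, perm=(0, 3, 1, 2))                 # back to NCHW for the caller
```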
......@@ -817,7 +818,7 @@ class TFCvtEncoder(tf.keras.layers.Layer):
@keras_serializable
class TFCvtMainLayer(tf.keras.layers.Layer):
class TFCvtMainLayer(keras.layers.Layer):
"""Construct the Cvt model."""
config_class = CvtConfig
......@@ -882,7 +883,7 @@ TFCVT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -893,7 +894,7 @@ TFCVT_START_DOCSTRING = r"""
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.
This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all the
This second option is useful when using the [`keras.Model.fit`] method, which currently requires having all the
tensors in the first argument of the model call function: `model(inputs)`.
</Tip>
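Concretely, the second option packs every tensor into the first positional argument, which is the calling convention `keras.Model.fit` uses internally (illustrative input shapes for an image model such as this one):

```python
import tensorflow as tf

inputs = {"pixel_values": tf.random.normal((1, 3, 224, 224))}

# The two call styles below are equivalent for these models:
# outputs = model(**inputs)   # keyword arguments, PyTorch-style
# outputs = model(inputs)     # single dict, as required by keras.Model.fit
```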
......@@ -1006,10 +1007,10 @@ class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassification
self.num_labels = config.num_labels
self.cvt = TFCvtMainLayer(config, name="cvt")
# Using same default epsilon as in the original implementation.
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")
self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")
# Classifier head
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=True,
......
......@@ -37,6 +37,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
......@@ -101,7 +102,7 @@ class TFData2VecVisionModelOutputWithPooling(TFBaseModelOutputWithPooling):
attentions: Tuple[tf.Tensor] | None = None
class TFData2VecVisionDropPath(tf.keras.layers.Layer):
class TFData2VecVisionDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
......@@ -121,7 +122,7 @@ class TFData2VecVisionDropPath(tf.keras.layers.Layer):
return x
class TFData2VecVisionEmbeddings(tf.keras.layers.Layer):
class TFData2VecVisionEmbeddings(keras.layers.Layer):
"""
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
......@@ -135,7 +136,7 @@ class TFData2VecVisionEmbeddings(tf.keras.layers.Layer):
self.num_patches = self.patch_embeddings.num_patches
self.config = config
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape=None):
self.cls_token = self.add_weight(
......@@ -193,7 +194,7 @@ class TFData2VecVisionEmbeddings(tf.keras.layers.Layer):
return embeddings
class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer):
class TFData2VecVisionPatchEmbeddings(keras.layers.Layer):
"""
Image to Patch Embedding.
"""
......@@ -215,7 +216,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer):
self.patch_shape = patch_shape
self.num_channels = num_channels
self.projection = tf.keras.layers.Conv2D(
self.projection = keras.layers.Conv2D(
filters=hidden_size,
kernel_size=patch_size,
strides=patch_size,
......@@ -240,7 +241,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer):
f" ({self.image_size[0]}*{self.image_size[1]})."
)
# When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
# When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
# So change the input format from `NCHW` to `NHWC`.
# shape = (batch_size, in_height, in_width, in_channels=num_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
......@@ -262,7 +263,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer):
self.projection.build([None, None, None, self.num_channels])
class TFData2VecVisionSelfAttention(tf.keras.layers.Layer):
class TFData2VecVisionSelfAttention(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs)
......@@ -277,19 +278,19 @@ class TFData2VecVisionSelfAttention(tf.keras.layers.Layer):
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = tf.keras.layers.Dense(
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.key = keras.layers.Dense(
units=self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="key",
use_bias=False,
)
self.value = tf.keras.layers.Dense(
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
if window_size:
self.relative_position_bias = TFData2VecVisionRelativePositionBias(
......@@ -376,7 +377,7 @@ class TFData2VecVisionSelfAttention(tf.keras.layers.Layer):
self.relative_position_bias.build(None)
class TFData2VecVisionSelfOutput(tf.keras.layers.Layer):
class TFData2VecVisionSelfOutput(keras.layers.Layer):
"""
The residual connection is defined in TFData2VecVisionLayer instead of here (as is the case with other models), due
to the layernorm applied before each block.
......@@ -385,10 +386,10 @@ class TFData2VecVisionSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor:
......@@ -406,7 +407,7 @@ class TFData2VecVisionSelfOutput(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size])
class TFData2VecVisionAttention(tf.keras.layers.Layer):
class TFData2VecVisionAttention(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs)
......@@ -451,11 +452,11 @@ class TFData2VecVisionAttention(tf.keras.layers.Layer):
# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision
class TFData2VecVisionIntermediate(tf.keras.layers.Layer):
class TFData2VecVisionIntermediate(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
......@@ -480,14 +481,14 @@ class TFData2VecVisionIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size])
class TFData2VecVisionOutput(tf.keras.layers.Layer):
class TFData2VecVisionOutput(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
......@@ -505,7 +506,7 @@ class TFData2VecVisionOutput(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.intermediate_size])
class TFData2VecVisionLayer(tf.keras.layers.Layer):
class TFData2VecVisionLayer(keras.layers.Layer):
"""This corresponds to the Block class in the timm implementation."""
def __init__(
......@@ -518,18 +519,14 @@ class TFData2VecVisionLayer(tf.keras.layers.Layer):
self.intermediate = TFData2VecVisionIntermediate(config, name="intermediate")
self.data2vec_output = TFData2VecVisionOutput(config, name="output")
self.layernorm_before = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="layernorm_before"
)
self.layernorm_after = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="layernorm_after"
)
self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
# Using `layers.Activation` instead of `tf.identity` to better control `training`
# behaviour.
self.drop_path = (
TFData2VecVisionDropPath(drop_path_rate, name="drop_path")
if drop_path_rate > 0.0
else tf.keras.layers.Activation("linear", name="drop_path")
else keras.layers.Activation("linear", name="drop_path")
)
self.init_values = config.layer_scale_init_value
......@@ -619,7 +616,7 @@ class TFData2VecVisionLayer(tf.keras.layers.Layer):
# Taken and modified from here:
# https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/beit/beit.py#L28
class TFData2VecVisionRelativePositionBias(tf.keras.layers.Layer):
class TFData2VecVisionRelativePositionBias(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: tuple, **kwargs) -> None:
super().__init__(**kwargs)
self.config = config
......@@ -675,7 +672,7 @@ class TFData2VecVisionRelativePositionBias(tf.keras.layers.Layer):
return tf.transpose(relative_position_bias, [2, 0, 1])
class TFData2VecVisionEncoder(tf.keras.layers.Layer):
class TFData2VecVisionEncoder(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
......@@ -753,7 +750,7 @@ class TFData2VecVisionEncoder(tf.keras.layers.Layer):
@keras_serializable
class TFData2VecVisionMainLayer(tf.keras.layers.Layer):
class TFData2VecVisionMainLayer(keras.layers.Layer):
config_class = Data2VecVisionConfig
def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = True, **kwargs):
......@@ -769,14 +766,14 @@ class TFData2VecVisionMainLayer(tf.keras.layers.Layer):
self.layernorm = (
tf.identity
if config.use_mean_pooling
else tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
else keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
)
# We are setting the `data_format` like so because from here on we will revert to the
# NCHW output format
self.pooler = TFData2VecVisionPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
......@@ -861,11 +858,11 @@ class TFData2VecVisionMainLayer(tf.keras.layers.Layer):
self.pooler.build(None)
class TFData2VecVisionPooler(tf.keras.layers.Layer):
class TFData2VecVisionPooler(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
self.layernorm = (
tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
if config.use_mean_pooling
else None
)
......@@ -909,7 +906,7 @@ DATA2VEC_VISION_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.).
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -1049,7 +1046,7 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF
self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=True, name="data2vec_vision")
# Classifier head
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
......@@ -1118,7 +1115,7 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF
self.classifier.build([None, None, self.config.hidden_size])
class TFData2VecVisionConvModule(tf.keras.layers.Layer):
class TFData2VecVisionConvModule(keras.layers.Layer):
"""
A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
......@@ -1137,7 +1134,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer):
**kwargs,
) -> None:
super().__init__(**kwargs)
self.conv = tf.keras.layers.Conv2D(
self.conv = keras.layers.Conv2D(
filters=out_channels,
kernel_size=kernel_size,
padding=padding,
......@@ -1145,7 +1142,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer):
dilation_rate=dilation,
name="conv",
)
self.bn = tf.keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5)
self.bn = keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5)
self.activation = tf.nn.relu
self.in_channels = in_channels
self.out_channels = out_channels
......@@ -1168,7 +1165,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer):
self.bn.build((None, None, None, self.out_channels))
class TFAdaptiveAvgPool2D(tf.keras.layers.Layer):
class TFAdaptiveAvgPool2D(keras.layers.Layer):
def __init__(self, output_dims: Tuple[int, int], input_ordering: str = "NHWC", **kwargs):
super().__init__(**kwargs)
self.output_dims = output_dims
......@@ -1292,7 +1289,7 @@ class TFAdaptiveAvgPool2D(tf.keras.layers.Layer):
return self.pseudo_1d_pool(h_pooled, h_pooling=False)
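`TFAdaptiveAvgPool2D` emulates PyTorch's `nn.AdaptiveAvgPool2d`, which TF lacks natively; the layer above does it as two pseudo-1D pools (height, then width) so it also handles non-divisible sizes. For the common case where the input dims are integer multiples of the output dims, the operation reduces to a reshape-and-mean. A simplified sketch of that case (assuming static, evenly divisible H/W; not the general-ratio algorithm used above):

```python
import tensorflow as tf

def adaptive_avg_pool2d_nhwc(x: tf.Tensor, out_h: int, out_w: int) -> tf.Tensor:
    """Adaptive average pooling for NHWC inputs whose H/W divide evenly."""
    b = tf.shape(x)[0]
    h, w, c = x.shape[1], x.shape[2], x.shape[3]  # static dims assumed
    # Split each spatial axis into (output_bins, window) and average each window.
    x = tf.reshape(x, (b, out_h, h // out_h, out_w, w // out_w, c))
    return tf.reduce_mean(x, axis=(2, 4))  # -> (b, out_h, out_w, c)
```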
class TFData2VecVisionPyramidPoolingModule(tf.keras.layers.Layer):
class TFData2VecVisionPyramidPoolingModule(keras.layers.Layer):
"""
Pyramid Pooling Module (PPM) used in PSPNet.
......@@ -1342,7 +1339,7 @@ class TFData2VecVisionPyramidPoolingModule(tf.keras.layers.Layer):
layer_module.build(None)
class TFData2VecVisionUperHead(tf.keras.layers.Layer):
class TFData2VecVisionUperHead(keras.layers.Layer):
"""
Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
[UPerNet](https://arxiv.org/abs/1807.10221).
......@@ -1356,7 +1353,7 @@ class TFData2VecVisionUperHead(tf.keras.layers.Layer):
self.pool_scales = config.pool_scales # e.g. (1, 2, 3, 6)
self.in_channels = [config.hidden_size] * 4 # e.g. [768, 768, 768, 768]
self.channels = config.hidden_size
self.classifier = tf.keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
# PSP Module
self.psp_modules = TFData2VecVisionPyramidPoolingModule(
......@@ -1452,7 +1449,7 @@ class TFData2VecVisionUperHead(tf.keras.layers.Layer):
layer.build(None)
class TFData2VecVisionFCNHead(tf.keras.layers.Layer):
class TFData2VecVisionFCNHead(keras.layers.Layer):
"""
Fully Convolutional Networks for Semantic Segmentation. This head is the implementation of
[FCNNet](https://arxiv.org/abs/1411.4038).
......@@ -1516,7 +1513,7 @@ class TFData2VecVisionFCNHead(tf.keras.layers.Layer):
name="conv_cat",
)
self.classifier = tf.keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor:
# just take the relevant feature maps
......@@ -1555,15 +1552,15 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
# FPNs
self.fpn1 = [
tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"),
tf.keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5),
tf.keras.layers.Activation("gelu"),
tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"),
keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"),
keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5),
keras.layers.Activation("gelu"),
keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"),
]
self.fpn2 = [tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0")]
self.fpn2 = [keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0")]
self.fpn3 = tf.identity
self.fpn4 = tf.keras.layers.MaxPool2D(pool_size=2, strides=2)
self.fpn4 = keras.layers.MaxPool2D(pool_size=2, strides=2)
# Semantic segmentation head(s)
self.decode_head = TFData2VecVisionUperHead(config, name="decode_head")
......@@ -1582,7 +1579,7 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
if auxiliary_logits is not None:
upsampled_auxiliary_logits = tf.image.resize(auxiliary_logits, size=label_interp_shape, method="bilinear")
# compute weighted loss
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
# Copied from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics.
# Utility to mask the index to ignore during computing the loss.
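With `reduction="none"` the loss keeps its per-position shape, so labels at the ignore index can be masked out before averaging, mirroring PyTorch's `ignore_index` semantics. A sketch of that masking (hypothetical helper; the `-100` default is an assumption for illustration, the real value comes from the config):

```python
import tensorflow as tf
from tensorflow import keras  # stand-in for the `keras` shim used in this PR

def masked_sparse_ce(labels: tf.Tensor, logits: tf.Tensor, ignore_index: int = -100) -> tf.Tensor:
    """Mean cross-entropy over positions whose label is not the ignore index."""
    loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    valid = tf.cast(labels != ignore_index, logits.dtype)
    # Clamp ignored labels to a legal class id; their loss is zeroed by the mask anyway.
    safe_labels = tf.where(labels == ignore_index, tf.zeros_like(labels), labels)
    per_position = loss_fct(safe_labels, logits) * valid
    return tf.reduce_sum(per_position) / tf.maximum(tf.reduce_sum(valid), 1.0)
```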
......
......@@ -39,6 +39,7 @@ from ...modeling_tf_utils import (
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
......@@ -58,10 +59,10 @@ TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
class TFDebertaContextPooler(tf.keras.layers.Layer):
class TFDebertaContextPooler(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.pooler_hidden_size, name="dense")
self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout")
self.config = config
......@@ -90,7 +91,7 @@ class TFDebertaContextPooler(tf.keras.layers.Layer):
self.dropout.build(None)
class TFDebertaXSoftmax(tf.keras.layers.Layer):
class TFDebertaXSoftmax(keras.layers.Layer):
"""
Masked Softmax which is optimized for saving memory
......@@ -112,7 +113,7 @@ class TFDebertaXSoftmax(tf.keras.layers.Layer):
return output
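What `TFDebertaXSoftmax` computes can be stated compactly: masked positions are pushed to the dtype's minimum before the softmax, then forced to exact zeros afterwards, so no separate additive mask tensor has to be materialized. A behavioral sketch (hypothetical helper, not the exact graph the layer builds):

```python
import tensorflow as tf

def masked_softmax(scores: tf.Tensor, mask: tf.Tensor, axis: int = -1) -> tf.Tensor:
    """Softmax over `axis`, with `mask == 0` positions forced to probability 0."""
    keep = tf.cast(mask, tf.bool)
    neg = tf.constant(scores.dtype.min, dtype=scores.dtype)
    probs = tf.nn.softmax(tf.where(keep, scores, neg), axis=axis)
    return tf.where(keep, probs, tf.zeros_like(probs))  # exact zeros, not tiny values
```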
class TFDebertaStableDropout(tf.keras.layers.Layer):
class TFDebertaStableDropout(keras.layers.Layer):
"""
Optimized dropout module for stabilizing the training
......@@ -152,7 +153,7 @@ class TFDebertaStableDropout(tf.keras.layers.Layer):
return inputs
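`TFDebertaStableDropout` implements dropout by hand rather than through `keras.layers.Dropout`. The core computation it stabilizes is ordinary inverted dropout, sketched below as a hypothetical simplification (the real layer additionally registers a custom gradient and tracks the drop probability as state):

```python
import tensorflow as tf

def inverted_dropout(x: tf.Tensor, drop_prob: float, training: bool) -> tf.Tensor:
    """Zeroes elements with probability drop_prob and rescales the survivors."""
    if not training or drop_prob == 0.0:
        return x
    mask = tf.cast(tf.random.uniform(tf.shape(x)) > drop_prob, x.dtype)
    return x * mask / (1.0 - drop_prob)  # keeps E[output] == E[input]
```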
class TFDebertaLayerNorm(tf.keras.layers.Layer):
class TFDebertaLayerNorm(keras.layers.Layer):
"""LayerNorm module in the TF style (epsilon inside the square root)."""
def __init__(self, size, eps=1e-12, **kwargs):
......@@ -172,11 +173,11 @@ class TFDebertaLayerNorm(tf.keras.layers.Layer):
return self.gamma * (x - mean) / std + self.beta
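The docstring's "epsilon inside the square root" is the distinguishing detail: epsilon is added to the variance before taking the root, rather than to the standard deviation after it. The return statement above corresponds to this computation (sketch assuming the statistics are taken over the last axis):

```python
import tensorflow as tf

def tf_style_layer_norm(x: tf.Tensor, gamma: tf.Tensor, beta: tf.Tensor, eps: float = 1e-12) -> tf.Tensor:
    mean = tf.reduce_mean(x, axis=-1, keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True)
    std = tf.sqrt(variance + eps)  # epsilon inside the square root
    return gamma * (x - mean) / std + beta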
class TFDebertaSelfOutput(tf.keras.layers.Layer):
class TFDebertaSelfOutput(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dense = keras.layers.Dense(config.hidden_size, name="dense")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
......@@ -201,7 +202,7 @@ class TFDebertaSelfOutput(tf.keras.layers.Layer):
self.dropout.build(None)
class TFDebertaAttention(tf.keras.layers.Layer):
class TFDebertaAttention(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.self = TFDebertaDisentangledSelfAttention(config, name="self")
......@@ -249,11 +250,11 @@ class TFDebertaAttention(tf.keras.layers.Layer):
self.dense_output.build(None)
class TFDebertaIntermediate(tf.keras.layers.Layer):
class TFDebertaIntermediate(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
......@@ -278,14 +279,14 @@ class TFDebertaIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size])
class TFDebertaOutput(tf.keras.layers.Layer):
class TFDebertaOutput(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
......@@ -311,7 +312,7 @@ class TFDebertaOutput(tf.keras.layers.Layer):
self.dropout.build(None)
class TFDebertaLayer(tf.keras.layers.Layer):
class TFDebertaLayer(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
......@@ -362,7 +363,7 @@ class TFDebertaLayer(tf.keras.layers.Layer):
self.bert_output.build(None)
class TFDebertaEncoder(tf.keras.layers.Layer):
class TFDebertaEncoder(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
......@@ -543,7 +544,7 @@ def torch_gather(x, indices, gather_axis):
return gathered
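`torch_gather` re-creates PyTorch's `torch.gather` on top of TF's gather ops, which index differently: `tf.gather` pulls whole slices, while `torch.gather` picks elements one by one along a single axis. A minimal sketch for the last-axis case (hypothetical helper; the function above generalizes the axis):

```python
import tensorflow as tf

def gather_last_axis(x: tf.Tensor, indices: tf.Tensor) -> tf.Tensor:
    """Element-wise gather along the last axis, like torch.gather(x, -1, indices)."""
    # batch_dims pins every leading axis, so only the last axis is indexed per element.
    return tf.gather(x, indices, batch_dims=len(indices.shape) - 1, axis=-1)
```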
class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
"""
Disentangled self-attention module
......@@ -564,7 +565,7 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.in_proj = tf.keras.layers.Dense(
self.in_proj = keras.layers.Dense(
self.all_head_size * 3,
kernel_initializer=get_initializer(config.initializer_range),
name="in_proj",
......@@ -576,13 +577,13 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
self.talking_head = getattr(config, "talking_head", False)
if self.talking_head:
self.head_logits_proj = tf.keras.layers.Dense(
self.head_logits_proj = keras.layers.Dense(
self.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
name="head_logits_proj",
use_bias=False,
)
self.head_weights_proj = tf.keras.layers.Dense(
self.head_weights_proj = keras.layers.Dense(
self.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range),
name="head_weights_proj",
......@@ -597,14 +598,14 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
self.max_relative_positions = config.max_position_embeddings
self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout")
if "c2p" in self.pos_att_type:
self.pos_proj = tf.keras.layers.Dense(
self.pos_proj = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="pos_proj",
use_bias=False,
)
if "p2c" in self.pos_att_type:
self.pos_q_proj = tf.keras.layers.Dense(
self.pos_q_proj = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj"
)
......@@ -616,10 +617,10 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
return
self.built = True
self.q_bias = self.add_weight(
name="q_bias", shape=(self.all_head_size), initializer=tf.keras.initializers.Zeros()
name="q_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
)
self.v_bias = self.add_weight(
name="v_bias", shape=(self.all_head_size), initializer=tf.keras.initializers.Zeros()
name="v_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
)
if getattr(self, "in_proj", None) is not None:
with tf.name_scope(self.in_proj.name):
......@@ -818,7 +819,7 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
return score
class TFDebertaEmbeddings(tf.keras.layers.Layer):
class TFDebertaEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
......@@ -831,13 +832,13 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
self.position_biased_input = getattr(config, "position_biased_input", True)
self.initializer_range = config.initializer_range
if self.embedding_size != config.hidden_size:
self.embed_proj = tf.keras.layers.Dense(
self.embed_proj = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embed_proj",
use_bias=False,
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
def build(self, input_shape=None):
......@@ -937,13 +938,13 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
return final_embeddings
class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer):
class TFDebertaPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=self.embedding_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
......@@ -953,7 +954,7 @@ class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
......@@ -975,8 +976,8 @@ class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.embedding_size])
class TFDebertaLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
class TFDebertaLMPredictionHead(keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
......@@ -998,7 +999,7 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer):
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable):
......@@ -1023,8 +1024,8 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer):
return hidden_states
class TFDebertaOnlyMLMHead(tf.keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
class TFDebertaOnlyMLMHead(keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions")
......@@ -1043,7 +1044,7 @@ class TFDebertaOnlyMLMHead(tf.keras.layers.Layer):
# @keras_serializable
class TFDebertaMainLayer(tf.keras.layers.Layer):
class TFDebertaMainLayer(keras.layers.Layer):
config_class = DebertaConfig
def __init__(self, config: DebertaConfig, **kwargs):
......@@ -1054,7 +1055,7 @@ class TFDebertaMainLayer(tf.keras.layers.Layer):
self.embeddings = TFDebertaEmbeddings(config, name="embeddings")
self.encoder = TFDebertaEncoder(config, name="encoder")
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
......@@ -1153,7 +1154,7 @@ DEBERTA_START_DOCSTRING = r"""
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -1299,7 +1300,7 @@ class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLos
self.deberta = TFDebertaMainLayer(config, name="deberta")
self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")
def get_lm_head(self) -> tf.keras.layers.Layer:
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
@unpack_inputs
......@@ -1385,7 +1386,7 @@ class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceCla
drop_out = getattr(config, "cls_dropout", None)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = TFDebertaStableDropout(drop_out, name="cls_dropout")
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
......@@ -1479,8 +1480,8 @@ class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassific
self.num_labels = config.num_labels
self.deberta = TFDebertaMainLayer(config, name="deberta")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
......@@ -1562,7 +1563,7 @@ class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnswerin
self.num_labels = config.num_labels
self.deberta = TFDebertaMainLayer(config, name="deberta")
self.qa_outputs = tf.keras.layers.Dense(
self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
......
......@@ -39,6 +39,7 @@ from ...modeling_tf_utils import (
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
......@@ -58,10 +59,10 @@ TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaContextPooler with Deberta->DebertaV2
class TFDebertaV2ContextPooler(tf.keras.layers.Layer):
class TFDebertaV2ContextPooler(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.pooler_hidden_size, name="dense")
self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
self.dropout = TFDebertaV2StableDropout(config.pooler_dropout, name="dropout")
self.config = config
......@@ -91,7 +92,7 @@ class TFDebertaV2ContextPooler(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaXSoftmax with Deberta->DebertaV2
class TFDebertaV2XSoftmax(tf.keras.layers.Layer):
class TFDebertaV2XSoftmax(keras.layers.Layer):
"""
Masked Softmax which is optimized for saving memory
......@@ -114,7 +115,7 @@ class TFDebertaV2XSoftmax(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaStableDropout with Deberta->DebertaV2
class TFDebertaV2StableDropout(tf.keras.layers.Layer):
class TFDebertaV2StableDropout(keras.layers.Layer):
"""
Optimized dropout module for stabilizing the training
......@@ -155,11 +156,11 @@ class TFDebertaV2StableDropout(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaSelfOutput with Deberta->DebertaV2
class TFDebertaV2SelfOutput(tf.keras.layers.Layer):
class TFDebertaV2SelfOutput(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dense = keras.layers.Dense(config.hidden_size, name="dense")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
......@@ -185,7 +186,7 @@ class TFDebertaV2SelfOutput(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaAttention with Deberta->DebertaV2
class TFDebertaV2Attention(tf.keras.layers.Layer):
class TFDebertaV2Attention(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.self = TFDebertaV2DisentangledSelfAttention(config, name="self")
......@@ -234,11 +235,11 @@ class TFDebertaV2Attention(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaIntermediate with Deberta->DebertaV2
class TFDebertaV2Intermediate(tf.keras.layers.Layer):
class TFDebertaV2Intermediate(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
......@@ -264,14 +265,14 @@ class TFDebertaV2Intermediate(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOutput with Deberta->DebertaV2
class TFDebertaV2Output(tf.keras.layers.Layer):
class TFDebertaV2Output(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
......@@ -298,7 +299,7 @@ class TFDebertaV2Output(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLayer with Deberta->DebertaV2
class TFDebertaV2Layer(tf.keras.layers.Layer):
class TFDebertaV2Layer(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
......@@ -349,7 +350,7 @@ class TFDebertaV2Layer(tf.keras.layers.Layer):
self.bert_output.build(None)
class TFDebertaV2ConvLayer(tf.keras.layers.Layer):
class TFDebertaV2ConvLayer(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
......@@ -357,7 +358,7 @@ class TFDebertaV2ConvLayer(tf.keras.layers.Layer):
# groups = getattr(config, "conv_groups", 1)
self.conv_act = get_tf_activation(getattr(config, "conv_act", "tanh"))
self.padding = (self.kernel_size - 1) // 2
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config
......@@ -412,7 +413,7 @@ class TFDebertaV2ConvLayer(tf.keras.layers.Layer):
return output_states
class TFDebertaV2Encoder(tf.keras.layers.Layer):
class TFDebertaV2Encoder(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
......@@ -433,7 +434,7 @@ class TFDebertaV2Encoder(tf.keras.layers.Layer):
self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
if "layer_norm" in self.norm_rel_ebd:
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.conv = TFDebertaV2ConvLayer(config, name="conv") if getattr(config, "conv_kernel_size", 0) > 0 else None
......@@ -634,7 +635,7 @@ def take_along_axis(x, indices):
return gathered
class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
"""
Disentangled self-attention module
......@@ -656,19 +657,19 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
_attention_head_size = config.hidden_size // config.num_attention_heads
self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query_proj = tf.keras.layers.Dense(
self.query_proj = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="query_proj",
use_bias=True,
)
self.key_proj = tf.keras.layers.Dense(
self.key_proj = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="key_proj",
use_bias=True,
)
self.value_proj = tf.keras.layers.Dense(
self.value_proj = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="value_proj",
......@@ -692,14 +693,14 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
if not self.share_att_key:
if "c2p" in self.pos_att_type:
self.pos_key_proj = tf.keras.layers.Dense(
self.pos_key_proj = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="pos_proj",
use_bias=True,
)
if "p2c" in self.pos_att_type:
self.pos_query_proj = tf.keras.layers.Dense(
self.pos_query_proj = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="pos_q_proj",
......@@ -925,7 +926,7 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaEmbeddings Deberta->DebertaV2
class TFDebertaV2Embeddings(tf.keras.layers.Layer):
class TFDebertaV2Embeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
......@@ -938,13 +939,13 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
self.position_biased_input = getattr(config, "position_biased_input", True)
self.initializer_range = config.initializer_range
if self.embedding_size != config.hidden_size:
self.embed_proj = tf.keras.layers.Dense(
self.embed_proj = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embed_proj",
use_bias=False,
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
def build(self, input_shape=None):
......@@ -1045,13 +1046,13 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaPredictionHeadTransform with Deberta->DebertaV2
class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer):
class TFDebertaV2PredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=self.embedding_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
......@@ -1061,7 +1062,7 @@ class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
......@@ -1084,8 +1085,8 @@ class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLMPredictionHead with Deberta->DebertaV2
class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.Layer, **kwargs):
class TFDebertaV2LMPredictionHead(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
......@@ -1107,7 +1108,7 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer):
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable):
......@@ -1133,8 +1134,8 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOnlyMLMHead with Deberta->DebertaV2
class TFDebertaV2OnlyMLMHead(tf.keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.Layer, **kwargs):
class TFDebertaV2OnlyMLMHead(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.predictions = TFDebertaV2LMPredictionHead(config, input_embeddings, name="predictions")
......@@ -1153,7 +1154,7 @@ class TFDebertaV2OnlyMLMHead(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaMainLayer with Deberta->DebertaV2
class TFDebertaV2MainLayer(tf.keras.layers.Layer):
class TFDebertaV2MainLayer(keras.layers.Layer):
config_class = DebertaV2Config
def __init__(self, config: DebertaV2Config, **kwargs):
......@@ -1164,7 +1165,7 @@ class TFDebertaV2MainLayer(tf.keras.layers.Layer):
self.embeddings = TFDebertaV2Embeddings(config, name="embeddings")
self.encoder = TFDebertaV2Encoder(config, name="encoder")
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
......@@ -1264,7 +1265,7 @@ DEBERTA_START_DOCSTRING = r"""
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -1412,7 +1413,7 @@ class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelin
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.mlm = TFDebertaV2OnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")
def get_lm_head(self) -> tf.keras.layers.Layer:
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
@unpack_inputs
......@@ -1499,7 +1500,7 @@ class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenc
drop_out = getattr(config, "cls_dropout", None)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = TFDebertaV2StableDropout(drop_out, name="cls_dropout")
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
......@@ -1594,8 +1595,8 @@ class TFDebertaV2ForTokenClassification(TFDebertaV2PreTrainedModel, TFTokenClass
self.num_labels = config.num_labels
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
......@@ -1678,7 +1679,7 @@ class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsw
self.num_labels = config.num_labels
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.qa_outputs = tf.keras.layers.Dense(
self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
......@@ -1777,9 +1778,9 @@ class TFDebertaV2ForMultipleChoice(TFDebertaV2PreTrainedModel, TFMultipleChoiceL
super().__init__(config, *inputs, **kwargs)
self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.pooler = TFDebertaV2ContextPooler(config, name="pooler")
self.classifier = tf.keras.layers.Dense(
self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.output_dim = self.pooler.output_dim
......
......@@ -30,6 +30,7 @@ from ....modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
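The `keras` object added to these imports is resolved once inside `modeling_tf_utils`. A minimal sketch of that resolution, assuming only that the `tf_keras` backport package may or may not be installed (the error message wording is illustrative):

import tensorflow as tf
from packaging.version import parse

try:
    # Prefer the tf_keras backport, which keeps the Keras 2 API under Keras 3.
    import tf_keras as keras
except (ModuleNotFoundError, ImportError):
    # Fall back to the Keras bundled with TensorFlow...
    keras = tf.keras
    if parse(keras.__version__).major > 2:
        # ...unless that bundle is already Keras 3, which these models cannot run on.
        raise ValueError(
            "Your TensorFlow ships Keras 3, but the TF modeling code needs the "
            "Keras 2 API. Install the `tf_keras` package to keep using these models."
        )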
......@@ -56,7 +57,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
class TFPositionalEmbedding(tf.keras.layers.Layer):
class TFPositionalEmbedding(keras.layers.Layer):
def __init__(self, demb, **kwargs):
super().__init__(**kwargs)
......@@ -73,7 +74,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
return pos_emb[:, None, :]
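For context, TFPositionalEmbedding produces the classic Transformer-XL sinusoidal encoding; the diff only swaps its base class, not the math. A sketch of the computation it performs (the standalone function name is ours):

import tensorflow as tf

def sinusoidal_pos_emb(pos_seq: tf.Tensor, demb: int) -> tf.Tensor:
    # Geometrically spaced frequencies, one per (sin, cos) channel pair.
    inv_freq = 1.0 / (10000 ** (tf.range(0, demb, 2.0) / demb))
    sinusoid = tf.einsum("i,j->ij", pos_seq, inv_freq)
    pos_emb = tf.concat([tf.sin(sinusoid), tf.cos(sinusoid)], axis=-1)
    return pos_emb[:, None, :]  # insert a broadcastable batch axis, as returned above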
class TFPositionwiseFF(tf.keras.layers.Layer):
class TFPositionwiseFF(keras.layers.Layer):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
super().__init__(**kwargs)
......@@ -81,14 +82,14 @@ class TFPositionwiseFF(tf.keras.layers.Layer):
self.d_inner = d_inner
self.dropout = dropout
self.layer_1 = tf.keras.layers.Dense(
self.layer_1 = keras.layers.Dense(
d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0"
)
self.drop_1 = tf.keras.layers.Dropout(dropout)
self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
self.drop_2 = tf.keras.layers.Dropout(dropout)
self.drop_1 = keras.layers.Dropout(dropout)
self.layer_2 = keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
self.drop_2 = keras.layers.Dropout(dropout)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
self.pre_lnorm = pre_lnorm
......@@ -116,7 +117,7 @@ class TFPositionwiseFF(tf.keras.layers.Layer):
return output
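The pre_lnorm flag threaded through this constructor selects between the two standard residual wirings; a sketch of the call path under that reading (pseudocode over the layers built above):

def positionwise_ff(inp, core, layer_norm, pre_lnorm):
    # `core` stands for layer_1 -> drop_1 -> layer_2 -> drop_2 from the constructor.
    if pre_lnorm:
        # Pre-norm: normalize first, run the feed-forward core, then add the residual.
        return core(layer_norm(inp)) + inp
    # Post-norm: run the core, add the residual, then normalize the sum.
    return layer_norm(core(inp) + inp)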
class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
class TFRelPartialLearnableMultiHeadAttn(keras.layers.Layer):
def __init__(
self,
n_head,
......@@ -140,17 +141,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
self.dropout = dropout
self.output_attentions = output_attentions
self.qkv_net = tf.keras.layers.Dense(
self.qkv_net = keras.layers.Dense(
3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net"
)
self.drop = tf.keras.layers.Dropout(dropout)
self.dropatt = tf.keras.layers.Dropout(dropatt)
self.o_net = tf.keras.layers.Dense(
self.drop = keras.layers.Dropout(dropout)
self.dropatt = keras.layers.Dropout(dropatt)
self.o_net = keras.layers.Dense(
d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net"
)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
self.scale = 1 / (d_head**0.5)
......@@ -163,7 +164,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
self.r_r_bias = None
self.r_w_bias = None
self.r_net = tf.keras.layers.Dense(
self.r_net = keras.layers.Dense(
self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net"
)
......@@ -268,7 +269,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
return outputs
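The attention scores this layer returns depend on Transformer-XL's relative-shift trick; a sketch of how that shift is commonly implemented, assuming scores shaped (qlen, klen, bsz, n_head):

import tensorflow as tf

def rel_shift(x: tf.Tensor) -> tf.Tensor:
    # Pad one zero column, reshape so the pad rotates every row by its index,
    # then crop: absolute-position scores become relative-position scores.
    x_size = tf.shape(x)
    x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
    x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
    x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
    return tf.reshape(x, x_size)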
class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
class TFRelPartialLearnableDecoderLayer(keras.layers.Layer):
def __init__(
self,
n_head,
......@@ -320,7 +321,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
return outputs
class TFTransfoEmbeddings(tf.keras.layers.Layer):
class TFTransfoEmbeddings(keras.layers.Layer):
def __init__(self, vocab_size, emb_size, init_std, **kwargs):
super().__init__(**kwargs)
......@@ -341,7 +342,7 @@ class TFTransfoEmbeddings(tf.keras.layers.Layer):
return tf.gather(self.weight, inputs)
class TFAdaptiveEmbedding(tf.keras.layers.Layer):
class TFAdaptiveEmbedding(keras.layers.Layer):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
super().__init__(**kwargs)
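TFAdaptiveEmbedding splits the vocabulary at `cutoffs` and narrows the embedding width by a factor of `div_val` per cluster, so frequent tokens get wide embeddings and rare tokens narrow ones, all projected back to `d_proj`. A small sketch of the resulting cluster geometry, assuming WikiText-103-style hyperparameters:

# Assumed values: n_token=267735, cutoffs=[20000, 40000, 200000], d_embed=1024, div_val=4.
cutoffs = [0, 20000, 40000, 200000, 267735]
d_embed, div_val = 1024, 4
for i in range(len(cutoffs) - 1):
    l_idx, r_idx = cutoffs[i], cutoffs[i + 1]
    d_emb_i = d_embed // (div_val**i)  # widths: 1024, 256, 64, 16
    print(f"cluster {i}: tokens [{l_idx}, {r_idx}) -> embedding width {d_emb_i}")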
......@@ -418,7 +419,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
@keras_serializable
class TFTransfoXLMainLayer(tf.keras.layers.Layer):
class TFTransfoXLMainLayer(keras.layers.Layer):
config_class = TransfoXLConfig
def __init__(self, config, **kwargs):
......@@ -447,7 +448,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
name="word_emb",
)
self.drop = tf.keras.layers.Dropout(config.dropout)
self.drop = keras.layers.Dropout(config.dropout)
self.n_layer = config.n_layer
self.mem_len = config.mem_len
......@@ -773,7 +774,7 @@ TRANSFO_XL_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -1022,7 +1023,7 @@ class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenc
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.score = tf.keras.layers.Dense(
self.score = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.init_range),
name="score",
......
......@@ -20,10 +20,11 @@
import tensorflow as tf
from ....modeling_tf_utils import keras
from ....tf_utils import shape_list
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
class TFAdaptiveSoftmaxMask(keras.layers.Layer):
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
super().__init__(**kwargs)
......
......@@ -32,6 +32,7 @@ from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
unpack_inputs,
)
from ...tf_utils import shape_list
......@@ -77,7 +78,7 @@ ENCODER_DECODER_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -258,7 +259,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
self.encoder.config.hidden_size != self.decoder.config.hidden_size
and self.decoder.config.cross_attention_hidden_size is None
):
self.enc_to_dec_proj = tf.keras.layers.Dense(
self.enc_to_dec_proj = keras.layers.Dense(
units=self.decoder.config.hidden_size,
kernel_initializer=get_initializer(config.encoder.initializer_range),
name="enc_to_dec_proj",
......@@ -445,7 +446,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix
decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
# Make sure these 2 `tf.keras.Model` have fixed names so `from_pretrained` could load model weights correctly.
# Make sure these two `keras.Model` instances have fixed names so that `from_pretrained` can load the model weights correctly.
if encoder.name != "encoder":
raise ValueError("encoder model must be created with the name `encoder`.")
if decoder.name != "decoder":
......