Unverified commit 415e9a09, authored by Matt, committed by GitHub

Add tf_keras imports to prepare for Keras 3 (#28588)

* Port core files + ESM (because ESM code is odd)

* Search-replace in modelling code

* Fix up transfo_xl as well

* Fix other core files + tests (still need to add correct import to tests)

* Fix cookiecutter

* make fixup, fix imports in some more core files

* Auto-add imports to tests

* Cleanup, add imports to sagemaker tests

* Use correct exception for importing tf_keras

* Fixes in modeling_tf_utils

* make fixup

* Correct version parsing code

* Ensure the pipeline tests correctly revert to float32 after each test (see the teardown sketch below)

* More tf.keras -> keras

* Add dtype cast

* Better imports of tf_keras (see the import sketch below)

* Add a cast for tf.assign, just in case

* Fix callback imports
parent 1d489b3e
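
The diff below is a mechanical `tf.keras` -> `keras` search-and-replace across the TF modeling files, where `keras` is now imported from `modeling_tf_utils` (note the `+    keras,` lines added to each file's import block). The heart of the PR is the compatibility import behind that symbol: prefer the backwards-compatible tf-keras package, and fall back to the installed `keras` only if it is still Keras 2. A minimal sketch of that pattern (the exact merged code and error wording may differ):

    from packaging.version import parse

    try:
        import tf_keras as keras  # backwards-compatible fork of Keras 2
    except (ModuleNotFoundError, ImportError):
        import keras

        # "Correct version parsing code": compare the parsed major version, not the raw string
        if parse(keras.__version__).major > 2:
            raise ValueError(
                "Your currently installed version of Keras is Keras 3, but this is not yet "
                "supported in Transformers. Please install the backwards-compatible tf-keras "
                "package with `pip install tf-keras`."
            )

For the float32 bullet, a hypothetical teardown like the one below restores the global dtype policy so a test that switches to float16 cannot leak into the next one (`keras.mixed_precision.set_global_policy` is the public Keras 2 API; the actual test helper in the PR may differ):

    import unittest

    class TFPipelineTestMixin(unittest.TestCase):  # hypothetical helper, not from the PR
        def tearDown(self):
            super().tearDown()
            # revert any float16/mixed_float16 policy set during the test,
            # reusing the `keras` resolved by the import sketch above
            keras.mixed_precision.set_global_policy("float32")
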
@@ -46,6 +46,7 @@ from ...modeling_tf_utils import (
     TFSequenceClassificationLoss,
     TFTokenClassificationLoss,
     get_initializer,
+    keras,
     keras_serializable,
     unpack_inputs,
 )
@@ -75,7 +76,7 @@ CAMEMBERT_START_DOCSTRING = r"""
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)

-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
     as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
     behavior.
@@ -168,7 +169,7 @@ CAMEMBERT_INPUTS_DOCSTRING = r"""
 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings
-class TFCamembertEmbeddings(tf.keras.layers.Layer):
+class TFCamembertEmbeddings(keras.layers.Layer):
     """
     Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
     """
@@ -181,8 +182,8 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
         self.hidden_size = config.hidden_size
         self.max_position_embeddings = config.max_position_embeddings
         self.initializer_range = config.initializer_range
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def build(self, input_shape=None):
         with tf.name_scope("word_embeddings"):
@@ -274,11 +275,11 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert
-class TFCamembertPooler(tf.keras.layers.Layer):
+class TFCamembertPooler(keras.layers.Layer):
     def __init__(self, config: CamembertConfig, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             units=config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),
             activation="tanh",
@@ -304,7 +305,7 @@ class TFCamembertPooler(tf.keras.layers.Layer):
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert
-class TFCamembertSelfAttention(tf.keras.layers.Layer):
+class TFCamembertSelfAttention(keras.layers.Layer):
     def __init__(self, config: CamembertConfig, **kwargs):
         super().__init__(**kwargs)
@@ -319,16 +320,16 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer):
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

-        self.query = tf.keras.layers.Dense(
+        self.query = keras.layers.Dense(
             units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
         )
-        self.key = tf.keras.layers.Dense(
+        self.key = keras.layers.Dense(
             units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
         )
-        self.value = tf.keras.layers.Dense(
+        self.value = keras.layers.Dense(
             units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
         )
-        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

         self.is_decoder = config.is_decoder
         self.config = config
@@ -437,15 +438,15 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer):
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert
-class TFCamembertSelfOutput(tf.keras.layers.Layer):
+class TFCamembertSelfOutput(keras.layers.Layer):
     def __init__(self, config: CamembertConfig, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
         self.config = config

     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
@@ -468,7 +469,7 @@ class TFCamembertSelfOutput(tf.keras.layers.Layer):
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert
-class TFCamembertAttention(tf.keras.layers.Layer):
+class TFCamembertAttention(keras.layers.Layer):
     def __init__(self, config: CamembertConfig, **kwargs):
         super().__init__(**kwargs)
@@ -520,11 +521,11 @@ class TFCamembertAttention(tf.keras.layers.Layer):
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert
-class TFCamembertIntermediate(tf.keras.layers.Layer):
+class TFCamembertIntermediate(keras.layers.Layer):
     def __init__(self, config: CamembertConfig, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -550,15 +551,15 @@ class TFCamembertIntermediate(tf.keras.layers.Layer):
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert
-class TFCamembertOutput(tf.keras.layers.Layer):
+class TFCamembertOutput(keras.layers.Layer):
     def __init__(self, config: CamembertConfig, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
         self.config = config

     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
@@ -581,7 +582,7 @@ class TFCamembertOutput(tf.keras.layers.Layer):
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert
-class TFCamembertLayer(tf.keras.layers.Layer):
+class TFCamembertLayer(keras.layers.Layer):
     def __init__(self, config: CamembertConfig, **kwargs):
         super().__init__(**kwargs)
@@ -685,7 +686,7 @@ class TFCamembertLayer(tf.keras.layers.Layer):
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert
-class TFCamembertEncoder(tf.keras.layers.Layer):
+class TFCamembertEncoder(keras.layers.Layer):
     def __init__(self, config: CamembertConfig, **kwargs):
         super().__init__(**kwargs)
         self.config = config
@@ -765,7 +766,7 @@ class TFCamembertEncoder(tf.keras.layers.Layer):
 @keras_serializable
 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert
-class TFCamembertMainLayer(tf.keras.layers.Layer):
+class TFCamembertMainLayer(keras.layers.Layer):
     config_class = CamembertConfig

     def __init__(self, config, add_pooling_layer=True, **kwargs):
@@ -785,7 +786,7 @@ class TFCamembertMainLayer(tf.keras.layers.Layer):
         self.embeddings = TFCamembertEmbeddings(config, name="embeddings")

     # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
-    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+    def get_input_embeddings(self) -> keras.layers.Layer:
         return self.embeddings

     # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
@@ -1068,7 +1069,7 @@ class TFCamembertModel(TFCamembertPreTrainedModel):
 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert
-class TFCamembertLMHead(tf.keras.layers.Layer):
+class TFCamembertLMHead(keras.layers.Layer):
     """Camembert Head for masked language modeling."""

     def __init__(self, config, input_embeddings, **kwargs):
@@ -1076,10 +1077,10 @@ class TFCamembertLMHead(tf.keras.layers.Layer):
         self.config = config
         self.hidden_size = config.hidden_size
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
         self.act = get_tf_activation("gelu")

         # The output weights are the same as the input embeddings, but there is
@@ -1222,12 +1223,12 @@ class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelingLoss):
 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead
-class TFCamembertClassificationHead(tf.keras.layers.Layer):
+class TFCamembertClassificationHead(keras.layers.Layer):
     """Head for sentence-level classification tasks."""

     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),
             activation="tanh",
@@ -1236,8 +1237,8 @@ class TFCamembertClassificationHead(tf.keras.layers.Layer):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
-        self.out_proj = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.out_proj = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
         self.config = config
@@ -1371,8 +1372,8 @@ class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClassificationLoss):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.classifier = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
         self.config = config
@@ -1463,8 +1464,8 @@ class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceLoss):
         super().__init__(config, *inputs, **kwargs)

         self.roberta = TFCamembertMainLayer(config, name="roberta")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
         self.config = config
@@ -1568,7 +1569,7 @@ class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsweringLoss):
         self.num_labels = config.num_labels

         self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")
-        self.qa_outputs = tf.keras.layers.Dense(
+        self.qa_outputs = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
         self.config = config
...
@@ -32,6 +32,7 @@ from ...modeling_tf_utils import (
     TFModelInputType,
     TFPreTrainedModel,
     get_initializer,
+    keras,
     keras_serializable,
     unpack_inputs,
 )
@@ -77,7 +78,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
 # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
 def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
     return tf.math.reduce_mean(
-        tf.keras.metrics.sparse_categorical_crossentropy(
+        keras.metrics.sparse_categorical_crossentropy(
             y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
         )
     )
@@ -127,7 +128,7 @@ class TFCLIPOutput(ModelOutput):
     )

-class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
+class TFCLIPVisionEmbeddings(keras.layers.Layer):
     def __init__(self, config: CLIPVisionConfig, **kwargs):
         super().__init__(**kwargs)
@@ -140,7 +141,7 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
         self.config = config

-        self.patch_embedding = tf.keras.layers.Conv2D(
+        self.patch_embedding = keras.layers.Conv2D(
             filters=self.embed_dim,
             kernel_size=self.patch_size,
             strides=self.patch_size,
@@ -201,7 +202,7 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
         return embeddings

-class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
+class TFCLIPTextEmbeddings(keras.layers.Layer):
     def __init__(self, config: CLIPTextConfig, **kwargs):
         super().__init__(**kwargs)
@@ -259,7 +260,7 @@ class TFCLIPTextEmbeddings(tf.keras.layers.Layer):
         return final_embeddings

-class TFCLIPAttention(tf.keras.layers.Layer):
+class TFCLIPAttention(keras.layers.Layer):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

     def __init__(self, config: CLIPConfig, **kwargs):
@@ -280,19 +281,19 @@ class TFCLIPAttention(tf.keras.layers.Layer):
         self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

-        self.q_proj = tf.keras.layers.Dense(
+        self.q_proj = keras.layers.Dense(
             units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
         )
-        self.k_proj = tf.keras.layers.Dense(
+        self.k_proj = keras.layers.Dense(
             units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
         )
-        self.v_proj = tf.keras.layers.Dense(
+        self.v_proj = keras.layers.Dense(
             units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
         )

-        self.dropout = tf.keras.layers.Dropout(rate=config.attention_dropout)
+        self.dropout = keras.layers.Dropout(rate=config.attention_dropout)

-        self.out_proj = tf.keras.layers.Dense(
+        self.out_proj = keras.layers.Dense(
             units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
         )
@@ -375,7 +376,7 @@ class TFCLIPAttention(tf.keras.layers.Layer):
         self.out_proj.build([None, None, self.embed_dim])

-class TFCLIPMLP(tf.keras.layers.Layer):
+class TFCLIPMLP(keras.layers.Layer):
     def __init__(self, config: CLIPConfig, **kwargs):
         super().__init__(**kwargs)
@@ -385,10 +386,10 @@ class TFCLIPMLP(tf.keras.layers.Layer):
         in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
         fc_std = (2 * config.hidden_size) ** -0.5 * factor

-        self.fc1 = tf.keras.layers.Dense(
+        self.fc1 = keras.layers.Dense(
             units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
         )
-        self.fc2 = tf.keras.layers.Dense(
+        self.fc2 = keras.layers.Dense(
             units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
         )
         self.config = config
@@ -411,15 +412,15 @@ class TFCLIPMLP(tf.keras.layers.Layer):
         self.fc2.build([None, None, self.config.intermediate_size])

-class TFCLIPEncoderLayer(tf.keras.layers.Layer):
+class TFCLIPEncoderLayer(keras.layers.Layer):
     def __init__(self, config: CLIPConfig, **kwargs):
         super().__init__(**kwargs)

         self.embed_dim = config.hidden_size
         self.self_attn = TFCLIPAttention(config, name="self_attn")
-        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
+        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
         self.mlp = TFCLIPMLP(config, name="mlp")
-        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
+        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")

     def call(
         self,
@@ -480,7 +481,7 @@ class TFCLIPEncoderLayer(tf.keras.layers.Layer):
         self.layer_norm2.build([None, None, self.embed_dim])

-class TFCLIPEncoder(tf.keras.layers.Layer):
+class TFCLIPEncoder(keras.layers.Layer):
     """
     Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
     [`TFCLIPEncoderLayer`].
@@ -544,15 +545,13 @@ class TFCLIPEncoder(tf.keras.layers.Layer):
             layer.build(None)

-class TFCLIPTextTransformer(tf.keras.layers.Layer):
+class TFCLIPTextTransformer(keras.layers.Layer):
     def __init__(self, config: CLIPTextConfig, **kwargs):
         super().__init__(**kwargs)

         self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings")
         self.encoder = TFCLIPEncoder(config, name="encoder")
-        self.final_layer_norm = tf.keras.layers.LayerNormalization(
-            epsilon=config.layer_norm_eps, name="final_layer_norm"
-        )
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")

         # For `pooled_output` computation
         self.eos_token_id = config.eos_token_id
@@ -663,7 +662,7 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer):
 @keras_serializable
-class TFCLIPTextMainLayer(tf.keras.layers.Layer):
+class TFCLIPTextMainLayer(keras.layers.Layer):
     config_class = CLIPTextConfig

     def __init__(self, config: CLIPTextConfig, **kwargs):
@@ -671,7 +670,7 @@ class TFCLIPTextMainLayer(tf.keras.layers.Layer):
         self.config = config
         self.text_model = TFCLIPTextTransformer(config, name="text_model")

-    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+    def get_input_embeddings(self) -> keras.layers.Layer:
         return self.text_model.embeddings

     def set_input_embeddings(self, value: tf.Variable):
@@ -718,14 +717,14 @@ class TFCLIPTextMainLayer(tf.keras.layers.Layer):
             self.text_model.build(None)

-class TFCLIPVisionTransformer(tf.keras.layers.Layer):
+class TFCLIPVisionTransformer(keras.layers.Layer):
     def __init__(self, config: CLIPVisionConfig, **kwargs):
         super().__init__(**kwargs)

         self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings")
-        self.pre_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
+        self.pre_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
         self.encoder = TFCLIPEncoder(config, name="encoder")
-        self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
+        self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
         self.embed_dim = config.hidden_size

     def call(
@@ -782,7 +781,7 @@ class TFCLIPVisionTransformer(tf.keras.layers.Layer):
 @keras_serializable
-class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
+class TFCLIPVisionMainLayer(keras.layers.Layer):
     config_class = CLIPVisionConfig

     def __init__(self, config: CLIPVisionConfig, **kwargs):
@@ -790,7 +789,7 @@ class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
         self.config = config
         self.vision_model = TFCLIPVisionTransformer(config, name="vision_model")

-    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+    def get_input_embeddings(self) -> keras.layers.Layer:
         return self.vision_model.embeddings

     @unpack_inputs
@@ -825,7 +824,7 @@ class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
 @keras_serializable
-class TFCLIPMainLayer(tf.keras.layers.Layer):
+class TFCLIPMainLayer(keras.layers.Layer):
     config_class = CLIPConfig

     def __init__(self, config: CLIPConfig, **kwargs):
@@ -853,14 +852,14 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
         self.text_model = TFCLIPTextTransformer(text_config, name="text_model")
         self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model")

-        self.visual_projection = tf.keras.layers.Dense(
+        self.visual_projection = keras.layers.Dense(
             units=self.projection_dim,
             kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor),
             use_bias=False,
             name="visual_projection",
         )
-        self.text_projection = tf.keras.layers.Dense(
+        self.text_projection = keras.layers.Dense(
             units=self.projection_dim,
             kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor),
             use_bias=False,
@@ -872,7 +871,7 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
     def build(self, input_shape: tf.TensorShape = None):
         self.logit_scale = self.add_weight(
             shape=(1,),
-            initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
+            initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
             trainable=True,
             name="logit_scale",
         )
@@ -1046,7 +1045,7 @@ CLIP_START_DOCSTRING = r"""
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)

-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
     as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
     behavior.
...
@@ -41,6 +41,7 @@ from ...modeling_tf_utils import (
     TFSequenceSummary,
     TFTokenClassificationLoss,
     get_initializer,
+    keras,
     keras_serializable,
     unpack_inputs,
 )
@@ -68,7 +69,7 @@ TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 # Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->ConvBert
-class TFConvBertEmbeddings(tf.keras.layers.Layer):
+class TFConvBertEmbeddings(keras.layers.Layer):
     """Construct the embeddings from word, position and token_type embeddings."""

     def __init__(self, config: ConvBertConfig, **kwargs):
@@ -78,8 +79,8 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
         self.embedding_size = config.embedding_size
         self.max_position_embeddings = config.max_position_embeddings
         self.initializer_range = config.initializer_range
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def build(self, input_shape=None):
         with tf.name_scope("word_embeddings"):
@@ -152,7 +153,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
         return final_embeddings

-class TFConvBertSelfAttention(tf.keras.layers.Layer):
+class TFConvBertSelfAttention(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -178,17 +179,17 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
         self.attention_head_size = config.hidden_size // config.num_attention_heads
         self.all_head_size = self.num_attention_heads * self.attention_head_size

-        self.query = tf.keras.layers.Dense(
+        self.query = keras.layers.Dense(
             self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
         )
-        self.key = tf.keras.layers.Dense(
+        self.key = keras.layers.Dense(
             self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
         )
-        self.value = tf.keras.layers.Dense(
+        self.value = keras.layers.Dense(
             self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
         )

-        self.key_conv_attn_layer = tf.keras.layers.SeparableConv1D(
+        self.key_conv_attn_layer = keras.layers.SeparableConv1D(
             self.all_head_size,
             self.conv_kernel_size,
             padding="same",
@@ -198,21 +199,21 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
             name="key_conv_attn_layer",
         )

-        self.conv_kernel_layer = tf.keras.layers.Dense(
+        self.conv_kernel_layer = keras.layers.Dense(
             self.num_attention_heads * self.conv_kernel_size,
             activation=None,
             name="conv_kernel_layer",
             kernel_initializer=get_initializer(config.initializer_range),
         )

-        self.conv_out_layer = tf.keras.layers.Dense(
+        self.conv_out_layer = keras.layers.Dense(
             self.all_head_size,
             activation=None,
             name="conv_out_layer",
             kernel_initializer=get_initializer(config.initializer_range),
         )

-        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
         self.config = config

     def transpose_for_scores(self, x, batch_size):
@@ -327,15 +328,15 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
             self.conv_out_layer.build([None, None, self.config.hidden_size])

-class TFConvBertSelfOutput(tf.keras.layers.Layer):
+class TFConvBertSelfOutput(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
         self.config = config

     def call(self, hidden_states, input_tensor, training=False):
@@ -357,7 +358,7 @@ class TFConvBertSelfOutput(tf.keras.layers.Layer):
             self.LayerNorm.build([None, None, self.config.hidden_size])

-class TFConvBertAttention(tf.keras.layers.Layer):
+class TFConvBertAttention(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -388,7 +389,7 @@ class TFConvBertAttention(tf.keras.layers.Layer):
             self.dense_output.build(None)

-class GroupedLinearLayer(tf.keras.layers.Layer):
+class GroupedLinearLayer(keras.layers.Layer):
     def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs):
         super().__init__(**kwargs)
         self.input_size = input_size
@@ -421,11 +422,11 @@ class GroupedLinearLayer(tf.keras.layers.Layer):
         return x

-class TFConvBertIntermediate(tf.keras.layers.Layer):
+class TFConvBertIntermediate(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
         if config.num_groups == 1:
-            self.dense = tf.keras.layers.Dense(
+            self.dense = keras.layers.Dense(
                 config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
             )
         else:
@@ -458,12 +459,12 @@ class TFConvBertIntermediate(tf.keras.layers.Layer):
             self.dense.build([None, None, self.config.hidden_size])

-class TFConvBertOutput(tf.keras.layers.Layer):
+class TFConvBertOutput(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
         if config.num_groups == 1:
-            self.dense = tf.keras.layers.Dense(
+            self.dense = keras.layers.Dense(
                 config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
             )
         else:
@@ -474,8 +475,8 @@ class TFConvBertOutput(tf.keras.layers.Layer):
                 kernel_initializer=get_initializer(config.initializer_range),
                 name="dense",
             )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
         self.config = config

     def call(self, hidden_states, input_tensor, training=False):
@@ -497,7 +498,7 @@ class TFConvBertOutput(tf.keras.layers.Layer):
             self.dense.build([None, None, self.config.intermediate_size])

-class TFConvBertLayer(tf.keras.layers.Layer):
+class TFConvBertLayer(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -531,7 +532,7 @@ class TFConvBertLayer(tf.keras.layers.Layer):
             self.bert_output.build(None)

-class TFConvBertEncoder(tf.keras.layers.Layer):
+class TFConvBertEncoder(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -583,11 +584,11 @@ class TFConvBertEncoder(tf.keras.layers.Layer):
                 layer.build(None)

-class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
+class TFConvBertPredictionHeadTransform(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -596,7 +597,7 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
         else:
             self.transform_act_fn = config.hidden_act

-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.config = config

     def call(self, hidden_states):
@@ -619,7 +620,7 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
 @keras_serializable
-class TFConvBertMainLayer(tf.keras.layers.Layer):
+class TFConvBertMainLayer(keras.layers.Layer):
     config_class = ConvBertConfig

     def __init__(self, config, **kwargs):
@@ -628,7 +629,7 @@ class TFConvBertMainLayer(tf.keras.layers.Layer):
         self.embeddings = TFConvBertEmbeddings(config, name="embeddings")

         if config.embedding_size != config.hidden_size:
-            self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project")
+            self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project")

         self.encoder = TFConvBertEncoder(config, name="encoder")
         self.config = config
@@ -755,7 +756,7 @@ CONVBERT_START_DOCSTRING = r"""
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)

-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
     as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
     behavior.
@@ -901,7 +902,7 @@ class TFConvBertModel(TFConvBertPreTrainedModel):
             self.convbert.build(None)

-class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
+class TFConvBertMaskedLMHead(keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
         super().__init__(**kwargs)
@@ -938,12 +939,12 @@ class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
         return hidden_states

-class TFConvBertGeneratorPredictions(tf.keras.layers.Layer):
+class TFConvBertGeneratorPredictions(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dense = keras.layers.Dense(config.embedding_size, name="dense")
         self.config = config

     def call(self, generator_hidden_states, training=False):
@@ -1058,20 +1059,20 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingLoss):
             self.generator_lm_head.build(None)

-class TFConvBertClassificationHead(tf.keras.layers.Layer):
+class TFConvBertClassificationHead(keras.layers.Layer):
     """Head for sentence-level classification tasks."""

     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
-        self.out_proj = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.out_proj = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
@@ -1193,7 +1194,7 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLoss):
         self.sequence_summary = TFSequenceSummary(
             config, initializer_range=config.initializer_range, name="sequence_summary"
         )
-        self.classifier = tf.keras.layers.Dense(
+        self.classifier = keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
         self.config = config
@@ -1302,8 +1303,8 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassificationLoss):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.classifier = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
         self.config = config
@@ -1386,7 +1387,7 @@ class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnsweringLoss):
         self.num_labels = config.num_labels
         self.convbert = TFConvBertMainLayer(config, name="convbert")
-        self.qa_outputs = tf.keras.layers.Dense(
+        self.qa_outputs = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
         self.config = config
...
...@@ -29,6 +29,7 @@ from ...modeling_tf_utils import ( ...@@ -29,6 +29,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
get_initializer, get_initializer,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -44,7 +45,7 @@ _CONFIG_FOR_DOC = "ConvNextConfig" ...@@ -44,7 +45,7 @@ _CONFIG_FOR_DOC = "ConvNextConfig"
_CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224" _CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
class TFConvNextDropPath(tf.keras.layers.Layer): class TFConvNextDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References: References:
(1) github.com:rwightman/pytorch-image-models (1) github.com:rwightman/pytorch-image-models
...@@ -64,22 +65,22 @@ class TFConvNextDropPath(tf.keras.layers.Layer): ...@@ -64,22 +65,22 @@ class TFConvNextDropPath(tf.keras.layers.Layer):
return x return x
class TFConvNextEmbeddings(tf.keras.layers.Layer): class TFConvNextEmbeddings(keras.layers.Layer):
"""This class is comparable to (and inspired by) the SwinEmbeddings class """This class is comparable to (and inspired by) the SwinEmbeddings class
found in src/transformers/models/swin/modeling_swin.py. found in src/transformers/models/swin/modeling_swin.py.
""" """
def __init__(self, config: ConvNextConfig, **kwargs): def __init__(self, config: ConvNextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.patch_embeddings = tf.keras.layers.Conv2D( self.patch_embeddings = keras.layers.Conv2D(
filters=config.hidden_sizes[0], filters=config.hidden_sizes[0],
kernel_size=config.patch_size, kernel_size=config.patch_size,
strides=config.patch_size, strides=config.patch_size,
name="patch_embeddings", name="patch_embeddings",
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(), bias_initializer=keras.initializers.Zeros(),
) )
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels self.num_channels = config.num_channels
self.config = config self.config = config
...@@ -93,7 +94,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): ...@@ -93,7 +94,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.", message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
) )
# When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
# So change the input format from `NCHW` to `NHWC`. # So change the input format from `NCHW` to `NHWC`.
# shape = (batch_size, in_height, in_width, in_channels) # shape = (batch_size, in_height, in_width, in_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
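A small illustration of the layout change this comment describes (the tensors here are dummies, not from the diff):

import tensorflow as tf

pixel_values = tf.random.normal((2, 3, 224, 224))      # NCHW input
nhwc = tf.transpose(pixel_values, perm=(0, 2, 3, 1))   # (2, 224, 224, 3), Conv2D-friendly
nchw = tf.transpose(nhwc, perm=(0, 3, 1, 2))           # back to NCHW for the output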
...@@ -114,7 +115,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer): ...@@ -114,7 +115,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]]) self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextLayer(tf.keras.layers.Layer): class TFConvNextLayer(keras.layers.Layer):
"""This corresponds to the `Block` class in the original implementation. """This corresponds to the `Block` class in the original implementation.
There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in (N, C, There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in (N, C,
...@@ -133,7 +134,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): ...@@ -133,7 +134,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dim = dim self.dim = dim
self.config = config self.config = config
self.dwconv = tf.keras.layers.Conv2D( self.dwconv = keras.layers.Conv2D(
filters=dim, filters=dim,
kernel_size=7, kernel_size=7,
padding="same", padding="same",
...@@ -142,18 +143,18 @@ class TFConvNextLayer(tf.keras.layers.Layer): ...@@ -142,18 +143,18 @@ class TFConvNextLayer(tf.keras.layers.Layer):
bias_initializer="zeros", bias_initializer="zeros",
name="dwconv", name="dwconv",
) # depthwise conv ) # depthwise conv
self.layernorm = tf.keras.layers.LayerNormalization( self.layernorm = keras.layers.LayerNormalization(
epsilon=1e-6, epsilon=1e-6,
name="layernorm", name="layernorm",
) )
self.pwconv1 = tf.keras.layers.Dense( self.pwconv1 = keras.layers.Dense(
units=4 * dim, units=4 * dim,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros", bias_initializer="zeros",
name="pwconv1", name="pwconv1",
) # pointwise/1x1 convs, implemented with linear layers ) # pointwise/1x1 convs, implemented with linear layers
self.act = get_tf_activation(config.hidden_act) self.act = get_tf_activation(config.hidden_act)
self.pwconv2 = tf.keras.layers.Dense( self.pwconv2 = keras.layers.Dense(
units=dim, units=dim,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros", bias_initializer="zeros",
...@@ -164,7 +165,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): ...@@ -164,7 +165,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
self.drop_path = ( self.drop_path = (
TFConvNextDropPath(drop_path, name="drop_path") TFConvNextDropPath(drop_path, name="drop_path")
if drop_path > 0.0 if drop_path > 0.0
else tf.keras.layers.Activation("linear", name="drop_path") else keras.layers.Activation("linear", name="drop_path")
) )
def build(self, input_shape: tf.TensorShape = None): def build(self, input_shape: tf.TensorShape = None):
...@@ -172,7 +173,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): ...@@ -172,7 +173,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
self.layer_scale_parameter = ( self.layer_scale_parameter = (
self.add_weight( self.add_weight(
shape=(self.dim,), shape=(self.dim,),
initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value), initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
trainable=True, trainable=True,
name="layer_scale_parameter", name="layer_scale_parameter",
) )
...@@ -214,7 +215,7 @@ class TFConvNextLayer(tf.keras.layers.Layer): ...@@ -214,7 +215,7 @@ class TFConvNextLayer(tf.keras.layers.Layer):
return x return x
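A rough sketch of how the sublayers defined above compose in this block's forward pass (paraphrased; the verbatim `call` is elided by the hunks in this diff):

def block_forward(self, hidden_states, training=False):
    residual = hidden_states
    x = self.dwconv(hidden_states)            # 7x7 depthwise conv (NHWC)
    x = self.layernorm(x)
    x = self.pwconv1(x)                       # 1x1 conv as Dense: dim -> 4*dim
    x = self.act(x)                           # GELU
    x = self.pwconv2(x)                       # back to dim
    if self.layer_scale_parameter is not None:
        x = self.layer_scale_parameter * x    # learned per-channel scale
    return residual + self.drop_path(x, training=training)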
class TFConvNextStage(tf.keras.layers.Layer): class TFConvNextStage(keras.layers.Layer):
"""ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks. """ConvNext stage, consisting of an optional downsampling layer + multiple residual blocks.
Args: Args:
...@@ -244,7 +245,7 @@ class TFConvNextStage(tf.keras.layers.Layer): ...@@ -244,7 +245,7 @@ class TFConvNextStage(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
if in_channels != out_channels or stride > 1: if in_channels != out_channels or stride > 1:
self.downsampling_layer = [ self.downsampling_layer = [
tf.keras.layers.LayerNormalization( keras.layers.LayerNormalization(
epsilon=1e-6, epsilon=1e-6,
name="downsampling_layer.0", name="downsampling_layer.0",
), ),
...@@ -253,12 +254,12 @@ class TFConvNextStage(tf.keras.layers.Layer): ...@@ -253,12 +254,12 @@ class TFConvNextStage(tf.keras.layers.Layer):
# layer. All the outputs throughout the model will be in NHWC # layer. All the outputs throughout the model will be in NHWC
# from this point on until the output where we again change to # from this point on until the output where we again change to
# NCHW. # NCHW.
tf.keras.layers.Conv2D( keras.layers.Conv2D(
filters=out_channels, filters=out_channels,
kernel_size=kernel_size, kernel_size=kernel_size,
strides=stride, strides=stride,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(), bias_initializer=keras.initializers.Zeros(),
name="downsampling_layer.1", name="downsampling_layer.1",
), ),
] ]
...@@ -301,7 +302,7 @@ class TFConvNextStage(tf.keras.layers.Layer): ...@@ -301,7 +302,7 @@ class TFConvNextStage(tf.keras.layers.Layer):
self.downsampling_layer[1].build([None, None, None, self.in_channels]) self.downsampling_layer[1].build([None, None, None, self.in_channels])
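The stage's forward pass implied by the pieces above is roughly the following sketch (assuming the residual blocks are stored in `self.layers`, and that `self.downsampling_layer` degenerates to an identity when no downsampling is needed):

def stage_forward(self, hidden_states, training=False):
    for layer in self.downsampling_layer:     # LayerNorm then strided Conv2D
        hidden_states = layer(hidden_states)
    for block in self.layers:
        hidden_states = block(hidden_states, training=training)
    return hidden_states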
class TFConvNextEncoder(tf.keras.layers.Layer): class TFConvNextEncoder(keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.stages = [] self.stages = []
...@@ -347,7 +348,7 @@ class TFConvNextEncoder(tf.keras.layers.Layer): ...@@ -347,7 +348,7 @@ class TFConvNextEncoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFConvNextMainLayer(tf.keras.layers.Layer): class TFConvNextMainLayer(keras.layers.Layer):
config_class = ConvNextConfig config_class = ConvNextConfig
def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs): def __init__(self, config: ConvNextConfig, add_pooling_layer: bool = True, **kwargs):
...@@ -356,10 +357,10 @@ class TFConvNextMainLayer(tf.keras.layers.Layer): ...@@ -356,10 +357,10 @@ class TFConvNextMainLayer(tf.keras.layers.Layer):
self.config = config self.config = config
self.embeddings = TFConvNextEmbeddings(config, name="embeddings") self.embeddings = TFConvNextEmbeddings(config, name="embeddings")
self.encoder = TFConvNextEncoder(config, name="encoder") self.encoder = TFConvNextEncoder(config, name="encoder")
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# We are setting the `data_format` like so because from here on we will revert to the # We are setting the `data_format` like so because from here on we will revert to the
# NCHW output format # NCHW output format
self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_first") if add_pooling_layer else None
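What the pooler produces, on a dummy NCHW feature map (illustration only; `keras` here stands in for the module imported above):

import tensorflow as tf
from tensorflow import keras

features = tf.random.normal((2, 768, 7, 7))   # (batch, channels, height, width)
pooler = keras.layers.GlobalAvgPool2D(data_format="channels_first")
print(pooler(features).shape)                 # (2, 768): spatial dims averaged away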
@unpack_inputs @unpack_inputs
def call( def call(
...@@ -436,7 +437,7 @@ CONVNEXT_START_DOCSTRING = r""" ...@@ -436,7 +437,7 @@ CONVNEXT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -575,7 +576,7 @@ class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClas ...@@ -575,7 +576,7 @@ class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClas
self.convnext = TFConvNextMainLayer(config, name="convnext") self.convnext = TFConvNextMainLayer(config, name="convnext")
# Classifier head # Classifier head
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer="zeros", bias_initializer="zeros",
......
...@@ -34,6 +34,7 @@ from ...modeling_tf_utils import ( ...@@ -34,6 +34,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
get_initializer, get_initializer,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -67,7 +68,7 @@ CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -67,7 +68,7 @@ CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->ConvNextV2 # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->ConvNextV2
class TFConvNextV2DropPath(tf.keras.layers.Layer): class TFConvNextV2DropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References: References:
(1) github.com:rwightman/pytorch-image-models (1) github.com:rwightman/pytorch-image-models
...@@ -87,7 +88,7 @@ class TFConvNextV2DropPath(tf.keras.layers.Layer): ...@@ -87,7 +88,7 @@ class TFConvNextV2DropPath(tf.keras.layers.Layer):
return x return x
class TFConvNextV2GRN(tf.keras.layers.Layer): class TFConvNextV2GRN(keras.layers.Layer):
"""GRN (Global Response Normalization) layer""" """GRN (Global Response Normalization) layer"""
def __init__(self, config: ConvNextV2Config, dim: int, **kwargs): def __init__(self, config: ConvNextV2Config, dim: int, **kwargs):
...@@ -99,12 +100,12 @@ class TFConvNextV2GRN(tf.keras.layers.Layer): ...@@ -99,12 +100,12 @@ class TFConvNextV2GRN(tf.keras.layers.Layer):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
shape=(1, 1, 1, self.dim), shape=(1, 1, 1, self.dim),
initializer=tf.keras.initializers.Zeros(), initializer=keras.initializers.Zeros(),
) )
self.bias = self.add_weight( self.bias = self.add_weight(
name="bias", name="bias",
shape=(1, 1, 1, self.dim), shape=(1, 1, 1, self.dim),
initializer=tf.keras.initializers.Zeros(), initializer=keras.initializers.Zeros(),
) )
return super().build(input_shape) return super().build(input_shape)
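With the `weight` and `bias` parameters built above, the GRN computation from the ConvNeXt V2 paper is roughly the following sketch (the epsilon value is an assumption):

def grn(self, hidden_states):
    # Per-channel L2 norm over the spatial dimensions: (B, 1, 1, C)
    global_features = tf.norm(hidden_states, ord="euclidean", axis=(1, 2), keepdims=True)
    # Divisive normalization across channels
    norm_features = global_features / (tf.reduce_mean(global_features, axis=-1, keepdims=True) + 1e-6)
    # Scale, shift, and keep the residual path
    return self.weight * (hidden_states * norm_features) + self.bias + hidden_states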
...@@ -116,22 +117,22 @@ class TFConvNextV2GRN(tf.keras.layers.Layer): ...@@ -116,22 +117,22 @@ class TFConvNextV2GRN(tf.keras.layers.Layer):
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextEmbeddings with ConvNext->ConvNextV2 # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextEmbeddings with ConvNext->ConvNextV2
class TFConvNextV2Embeddings(tf.keras.layers.Layer): class TFConvNextV2Embeddings(keras.layers.Layer):
"""This class is comparable to (and inspired by) the SwinEmbeddings class """This class is comparable to (and inspired by) the SwinEmbeddings class
found in src/transformers/models/swin/modeling_swin.py. found in src/transformers/models/swin/modeling_swin.py.
""" """
def __init__(self, config: ConvNextV2Config, **kwargs): def __init__(self, config: ConvNextV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.patch_embeddings = tf.keras.layers.Conv2D( self.patch_embeddings = keras.layers.Conv2D(
filters=config.hidden_sizes[0], filters=config.hidden_sizes[0],
kernel_size=config.patch_size, kernel_size=config.patch_size,
strides=config.patch_size, strides=config.patch_size,
name="patch_embeddings", name="patch_embeddings",
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(), bias_initializer=keras.initializers.Zeros(),
) )
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels self.num_channels = config.num_channels
self.config = config self.config = config
...@@ -145,7 +146,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer): ...@@ -145,7 +146,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer):
message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.", message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
) )
# When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
# So change the input format from `NCHW` to `NHWC`. # So change the input format from `NCHW` to `NHWC`.
# shape = (batch_size, in_height, in_width, in_channels) # shape = (batch_size, in_height, in_width, in_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
...@@ -166,7 +167,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer): ...@@ -166,7 +167,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]]) self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextV2Layer(tf.keras.layers.Layer): class TFConvNextV2Layer(keras.layers.Layer):
"""This corresponds to the `Block` class in the original implementation. """This corresponds to the `Block` class in the original implementation.
There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in (N, C, There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in (N, C,
...@@ -188,31 +189,31 @@ class TFConvNextV2Layer(tf.keras.layers.Layer): ...@@ -188,31 +189,31 @@ class TFConvNextV2Layer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dim = dim self.dim = dim
self.config = config self.config = config
self.dwconv = tf.keras.layers.Conv2D( self.dwconv = keras.layers.Conv2D(
filters=dim, filters=dim,
kernel_size=7, kernel_size=7,
padding="same", padding="same",
groups=dim, groups=dim,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(), bias_initializer=keras.initializers.Zeros(),
name="dwconv", name="dwconv",
) # depthwise conv ) # depthwise conv
self.layernorm = tf.keras.layers.LayerNormalization( self.layernorm = keras.layers.LayerNormalization(
epsilon=1e-6, epsilon=1e-6,
name="layernorm", name="layernorm",
) )
self.pwconv1 = tf.keras.layers.Dense( self.pwconv1 = keras.layers.Dense(
units=4 * dim, units=4 * dim,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(), bias_initializer=keras.initializers.Zeros(),
name="pwconv1", name="pwconv1",
) # pointwise/1x1 convs, implemented with linear layers ) # pointwise/1x1 convs, implemented with linear layers
self.act = get_tf_activation(config.hidden_act) self.act = get_tf_activation(config.hidden_act)
self.grn = TFConvNextV2GRN(config, 4 * dim, dtype=tf.float32, name="grn") self.grn = TFConvNextV2GRN(config, 4 * dim, dtype=tf.float32, name="grn")
self.pwconv2 = tf.keras.layers.Dense( self.pwconv2 = keras.layers.Dense(
units=dim, units=dim,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(), bias_initializer=keras.initializers.Zeros(),
name="pwconv2", name="pwconv2",
) )
# Using `layers.Activation` instead of `tf.identity` to better control `training` # Using `layers.Activation` instead of `tf.identity` to better control `training`
...@@ -220,7 +221,7 @@ class TFConvNextV2Layer(tf.keras.layers.Layer): ...@@ -220,7 +221,7 @@ class TFConvNextV2Layer(tf.keras.layers.Layer):
self.drop_path = ( self.drop_path = (
TFConvNextV2DropPath(drop_path, name="drop_path") TFConvNextV2DropPath(drop_path, name="drop_path")
if drop_path > 0.0 if drop_path > 0.0
else tf.keras.layers.Activation("linear", name="drop_path") else keras.layers.Activation("linear", name="drop_path")
) )
def call(self, hidden_states, training=False): def call(self, hidden_states, training=False):
...@@ -260,7 +261,7 @@ class TFConvNextV2Layer(tf.keras.layers.Layer): ...@@ -260,7 +261,7 @@ class TFConvNextV2Layer(tf.keras.layers.Layer):
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2 # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2
class TFConvNextV2Stage(tf.keras.layers.Layer): class TFConvNextV2Stage(keras.layers.Layer):
"""ConvNextV2 stage, consisting of an optional downsampling layer + multiple residual blocks. """ConvNextV2 stage, consisting of an optional downsampling layer + multiple residual blocks.
Args: Args:
...@@ -290,7 +291,7 @@ class TFConvNextV2Stage(tf.keras.layers.Layer): ...@@ -290,7 +291,7 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
if in_channels != out_channels or stride > 1: if in_channels != out_channels or stride > 1:
self.downsampling_layer = [ self.downsampling_layer = [
tf.keras.layers.LayerNormalization( keras.layers.LayerNormalization(
epsilon=1e-6, epsilon=1e-6,
name="downsampling_layer.0", name="downsampling_layer.0",
), ),
...@@ -299,12 +300,12 @@ class TFConvNextV2Stage(tf.keras.layers.Layer): ...@@ -299,12 +300,12 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
# layer. All the outputs throughout the model will be in NHWC # layer. All the outputs throughout the model will be in NHWC
# from this point on until the output where we again change to # from this point on until the output where we again change to
# NCHW. # NCHW.
tf.keras.layers.Conv2D( keras.layers.Conv2D(
filters=out_channels, filters=out_channels,
kernel_size=kernel_size, kernel_size=kernel_size,
strides=stride, strides=stride,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(), bias_initializer=keras.initializers.Zeros(),
name="downsampling_layer.1", name="downsampling_layer.1",
), ),
] ]
...@@ -347,7 +348,7 @@ class TFConvNextV2Stage(tf.keras.layers.Layer): ...@@ -347,7 +348,7 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
self.downsampling_layer[1].build([None, None, None, self.in_channels]) self.downsampling_layer[1].build([None, None, None, self.in_channels])
class TFConvNextV2Encoder(tf.keras.layers.Layer): class TFConvNextV2Encoder(keras.layers.Layer):
def __init__(self, config: ConvNextV2Config, **kwargs): def __init__(self, config: ConvNextV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.stages = [] self.stages = []
...@@ -398,7 +399,7 @@ class TFConvNextV2Encoder(tf.keras.layers.Layer): ...@@ -398,7 +399,7 @@ class TFConvNextV2Encoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFConvNextV2MainLayer(tf.keras.layers.Layer): class TFConvNextV2MainLayer(keras.layers.Layer):
config_class = ConvNextV2Config config_class = ConvNextV2Config
def __init__(self, config: ConvNextV2Config, **kwargs): def __init__(self, config: ConvNextV2Config, **kwargs):
...@@ -407,10 +408,10 @@ class TFConvNextV2MainLayer(tf.keras.layers.Layer): ...@@ -407,10 +408,10 @@ class TFConvNextV2MainLayer(tf.keras.layers.Layer):
self.config = config self.config = config
self.embeddings = TFConvNextV2Embeddings(config, name="embeddings") self.embeddings = TFConvNextV2Embeddings(config, name="embeddings")
self.encoder = TFConvNextV2Encoder(config, name="encoder") self.encoder = TFConvNextV2Encoder(config, name="encoder")
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# We are setting the `data_format` like so because from here on we will revert to the # We are setting the `data_format` like so because from here on we will revert to the
# NCHW output format # NCHW output format
self.pooler = tf.keras.layers.GlobalAvgPool2D(data_format="channels_last") self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_last")
@unpack_inputs @unpack_inputs
def call( def call(
...@@ -489,7 +490,7 @@ CONVNEXTV2_START_DOCSTRING = r""" ...@@ -489,7 +490,7 @@ CONVNEXTV2_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -614,10 +615,10 @@ class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequence ...@@ -614,10 +615,10 @@ class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequence
self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2") self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2")
# Classifier head # Classifier head
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
bias_initializer=tf.keras.initializers.Zeros(), bias_initializer=keras.initializers.Zeros(),
name="classifier", name="classifier",
) )
......
...@@ -29,6 +29,7 @@ from ...modeling_tf_utils import ( ...@@ -29,6 +29,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
get_initializer, get_initializer,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -90,7 +91,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N ...@@ -90,7 +91,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
return output, attention_weights return output, attention_weights
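A minimal sketch of the scaled dot-product attention computed by the function above (simplified: the `attention_mask`/`head_mask` handling from the signature is omitted):

def sdpa(q, k, v, mask=None):
    matmul_qk = tf.matmul(q, k, transpose_b=True)        # (..., T_q, T_k)
    dk = tf.cast(tf.shape(k)[-1], dtype=matmul_qk.dtype)
    scaled_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_logits += mask * -1e4                     # push masked positions toward zero weight
    attention_weights = tf.nn.softmax(scaled_logits, axis=-1)
    return tf.matmul(attention_weights, v), attention_weights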
class TFMultiHeadAttention(tf.keras.layers.Layer): class TFMultiHeadAttention(keras.layers.Layer):
def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs): def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.num_heads = num_heads self.num_heads = num_heads
...@@ -99,11 +100,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): ...@@ -99,11 +100,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
self.depth = int(d_model_size / self.num_heads) self.depth = int(d_model_size / self.num_heads)
self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq") self.Wq = keras.layers.Dense(d_model_size, name="Wq")
self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk") self.Wk = keras.layers.Dense(d_model_size, name="Wk")
self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv") self.Wv = keras.layers.Dense(d_model_size, name="Wv")
self.dense = tf.keras.layers.Dense(d_model_size, name="dense") self.dense = keras.layers.Dense(d_model_size, name="dense")
def split_into_heads(self, x, batch_size): def split_into_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
...@@ -160,12 +161,12 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): ...@@ -160,12 +161,12 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
self.dense.build([None, None, self.d_model_size]) self.dense.build([None, None, self.d_model_size])
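The reshape in `split_into_heads` above is the first half of the usual multi-head split; a sketch of the full round trip (the transpose step is elided by the hunk):

def split_into_heads(x, batch_size, num_heads, depth):
    x = tf.reshape(x, (batch_size, -1, num_heads, depth))   # (B, T, H, D/H)
    return tf.transpose(x, perm=[0, 2, 1, 3])               # (B, H, T, D/H)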
class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer): class TFPointWiseFeedForwardLayer(keras.layers.Layer):
def __init__(self, d_model_size, dff, **kwargs): def __init__(self, d_model_size, dff, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense_0 = tf.keras.layers.Dense(dff, activation="relu", name="0") self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0")
self.dense_2 = tf.keras.layers.Dense(d_model_size, name="2") self.dense_2 = keras.layers.Dense(d_model_size, name="2")
self.d_model_size = d_model_size self.d_model_size = d_model_size
self.dff = dff self.dff = dff
...@@ -187,7 +188,7 @@ class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer): ...@@ -187,7 +188,7 @@ class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
self.dense_2.build([None, None, self.dff]) self.dense_2.build([None, None, self.dff])
class TFEncoderLayer(tf.keras.layers.Layer): class TFEncoderLayer(keras.layers.Layer):
def __init__( def __init__(
self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
): ):
...@@ -200,11 +201,11 @@ class TFEncoderLayer(tf.keras.layers.Layer): ...@@ -200,11 +201,11 @@ class TFEncoderLayer(tf.keras.layers.Layer):
) )
self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn") self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn")
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2") self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")
self.dropout1 = tf.keras.layers.Dropout(rate) self.dropout1 = keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate) self.dropout2 = keras.layers.Dropout(rate)
self.d_model_size = d_model_size self.d_model_size = d_model_size
def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False): def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
...@@ -252,7 +253,7 @@ class TFEncoderLayer(tf.keras.layers.Layer): ...@@ -252,7 +253,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFCTRLMainLayer(tf.keras.layers.Layer): class TFCTRLMainLayer(keras.layers.Layer):
config_class = CTRLConfig config_class = CTRLConfig
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
...@@ -269,14 +270,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -269,14 +270,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size) self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
self.w = tf.keras.layers.Embedding( self.w = keras.layers.Embedding(
input_dim=config.vocab_size, input_dim=config.vocab_size,
output_dim=config.n_embd, output_dim=config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range), embeddings_initializer=get_initializer(config.initializer_range),
name="w", name="w",
) )
self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) self.dropout = keras.layers.Dropout(config.embd_pdrop)
self.h = [ self.h = [
TFEncoderLayer( TFEncoderLayer(
config.n_embd, config.n_embd,
...@@ -289,7 +290,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -289,7 +290,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
) )
for i in range(config.n_layer) for i in range(config.n_layer)
] ]
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
def get_input_embeddings(self): def get_input_embeddings(self):
return self.w return self.w
...@@ -476,7 +477,7 @@ CTRL_START_DOCSTRING = r""" ...@@ -476,7 +477,7 @@ CTRL_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -635,9 +636,9 @@ class TFCTRLModel(TFCTRLPreTrainedModel): ...@@ -635,9 +636,9 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
self.transformer.build(None) self.transformer.build(None)
class TFCTRLBiasLayer(tf.keras.layers.Layer): class TFCTRLBiasLayer(keras.layers.Layer):
""" """
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores weights on a per-layer basis, Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores weights on a per-layer basis,
so all weights have to be registered in a layer. so all weights have to be registered in a layer.
""" """
...@@ -812,7 +813,7 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific ...@@ -812,7 +813,7 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
config.num_labels, config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
......
...@@ -29,6 +29,7 @@ from ...modeling_tf_utils import ( ...@@ -29,6 +29,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
get_initializer, get_initializer,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -80,7 +81,7 @@ class TFBaseModelOutputWithCLSToken(ModelOutput): ...@@ -80,7 +81,7 @@ class TFBaseModelOutputWithCLSToken(ModelOutput):
hidden_states: Tuple[tf.Tensor, ...] | None = None hidden_states: Tuple[tf.Tensor, ...] | None = None
class TFCvtDropPath(tf.keras.layers.Layer): class TFCvtDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References: References:
(1) github.com:rwightman/pytorch-image-models (1) github.com:rwightman/pytorch-image-models
...@@ -100,7 +101,7 @@ class TFCvtDropPath(tf.keras.layers.Layer): ...@@ -100,7 +101,7 @@ class TFCvtDropPath(tf.keras.layers.Layer):
return (x / keep_prob) * random_tensor return (x / keep_prob) * random_tensor
class TFCvtEmbeddings(tf.keras.layers.Layer): class TFCvtEmbeddings(keras.layers.Layer):
"""Construct the Convolutional Token Embeddings.""" """Construct the Convolutional Token Embeddings."""
def __init__( def __init__(
...@@ -124,7 +125,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer): ...@@ -124,7 +125,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
padding=padding, padding=padding,
name="convolution_embeddings", name="convolution_embeddings",
) )
self.dropout = tf.keras.layers.Dropout(dropout_rate) self.dropout = keras.layers.Dropout(dropout_rate)
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution_embeddings(pixel_values) hidden_state = self.convolution_embeddings(pixel_values)
...@@ -140,7 +141,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer): ...@@ -140,7 +141,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
self.convolution_embeddings.build(None) self.convolution_embeddings.build(None)
class TFCvtConvEmbeddings(tf.keras.layers.Layer): class TFCvtConvEmbeddings(keras.layers.Layer):
"""Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts.""" """Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
def __init__( def __init__(
...@@ -154,9 +155,9 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer): ...@@ -154,9 +155,9 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
**kwargs, **kwargs,
): ):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) self.padding = keras.layers.ZeroPadding2D(padding=padding)
self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
self.projection = tf.keras.layers.Conv2D( self.projection = keras.layers.Conv2D(
filters=embed_dim, filters=embed_dim,
kernel_size=patch_size, kernel_size=patch_size,
strides=stride, strides=stride,
...@@ -166,7 +167,7 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer): ...@@ -166,7 +167,7 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
name="projection", name="projection",
) )
# Using the same default epsilon as PyTorch # Using the same default epsilon as PyTorch
self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization") self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
self.num_channels = num_channels self.num_channels = num_channels
self.embed_dim = embed_dim self.embed_dim = embed_dim
...@@ -198,13 +199,13 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer): ...@@ -198,13 +199,13 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
self.normalization.build([None, None, self.embed_dim]) self.normalization.build([None, None, self.embed_dim])
class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): class TFCvtSelfAttentionConvProjection(keras.layers.Layer):
"""Convolutional projection layer.""" """Convolutional projection layer."""
def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs): def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) self.padding = keras.layers.ZeroPadding2D(padding=padding)
self.convolution = tf.keras.layers.Conv2D( self.convolution = keras.layers.Conv2D(
filters=embed_dim, filters=embed_dim,
kernel_size=kernel_size, kernel_size=kernel_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
...@@ -215,7 +216,7 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): ...@@ -215,7 +216,7 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
groups=embed_dim, groups=embed_dim,
) )
# Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum) # Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum)
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.embed_dim = embed_dim self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
...@@ -235,7 +236,7 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): ...@@ -235,7 +236,7 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
self.normalization.build([None, None, None, self.embed_dim]) self.normalization.build([None, None, None, self.embed_dim])
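Side note (illustration, not from the diff): with `groups=embed_dim` equal to the filter count, the convolution above is depthwise, i.e. roughly interchangeable with:

depthwise = keras.layers.DepthwiseConv2D(
    kernel_size=kernel_size,   # same arguments as the projection's __init__
    strides=stride,
    padding="valid",           # the preceding ZeroPadding2D handles padding
)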
class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer): class TFCvtSelfAttentionLinearProjection(keras.layers.Layer):
"""Linear projection layer used to flatten tokens into 1D.""" """Linear projection layer used to flatten tokens into 1D."""
def call(self, hidden_state: tf.Tensor) -> tf.Tensor: def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
...@@ -246,7 +247,7 @@ class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer): ...@@ -246,7 +247,7 @@ class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer):
return hidden_state return hidden_state
class TFCvtSelfAttentionProjection(tf.keras.layers.Layer): class TFCvtSelfAttentionProjection(keras.layers.Layer):
"""Convolutional Projection for Attention.""" """Convolutional Projection for Attention."""
def __init__( def __init__(
...@@ -280,7 +281,7 @@ class TFCvtSelfAttentionProjection(tf.keras.layers.Layer): ...@@ -280,7 +281,7 @@ class TFCvtSelfAttentionProjection(tf.keras.layers.Layer):
self.convolution_projection.build(None) self.convolution_projection.build(None)
class TFCvtSelfAttention(tf.keras.layers.Layer): class TFCvtSelfAttention(keras.layers.Layer):
""" """
Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection) is applied for Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection) is applied for
query, key, and value embeddings. query, key, and value embeddings.
...@@ -336,28 +337,28 @@ class TFCvtSelfAttention(tf.keras.layers.Layer): ...@@ -336,28 +337,28 @@ class TFCvtSelfAttention(tf.keras.layers.Layer):
name="convolution_projection_value", name="convolution_projection_value",
) )
self.projection_query = tf.keras.layers.Dense( self.projection_query = keras.layers.Dense(
units=embed_dim, units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias, use_bias=qkv_bias,
bias_initializer="zeros", bias_initializer="zeros",
name="projection_query", name="projection_query",
) )
self.projection_key = tf.keras.layers.Dense( self.projection_key = keras.layers.Dense(
units=embed_dim, units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias, use_bias=qkv_bias,
bias_initializer="zeros", bias_initializer="zeros",
name="projection_key", name="projection_key",
) )
self.projection_value = tf.keras.layers.Dense( self.projection_value = keras.layers.Dense(
units=embed_dim, units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias, use_bias=qkv_bias,
bias_initializer="zeros", bias_initializer="zeros",
name="projection_value", name="projection_value",
) )
self.dropout = tf.keras.layers.Dropout(attention_drop_rate) self.dropout = keras.layers.Dropout(attention_drop_rate)
def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor: def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
batch_size, hidden_size, _ = shape_list(hidden_state) batch_size, hidden_size, _ = shape_list(hidden_state)
...@@ -424,15 +425,15 @@ class TFCvtSelfAttention(tf.keras.layers.Layer): ...@@ -424,15 +425,15 @@ class TFCvtSelfAttention(tf.keras.layers.Layer):
self.projection_value.build([None, None, self.embed_dim]) self.projection_value.build([None, None, self.embed_dim])
class TFCvtSelfOutput(tf.keras.layers.Layer): class TFCvtSelfOutput(keras.layers.Layer):
"""Output of the Attention layer .""" """Output of the Attention layer ."""
def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs): def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.dropout = tf.keras.layers.Dropout(drop_rate) self.dropout = keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
...@@ -449,7 +450,7 @@ class TFCvtSelfOutput(tf.keras.layers.Layer): ...@@ -449,7 +450,7 @@ class TFCvtSelfOutput(tf.keras.layers.Layer):
self.dense.build([None, None, self.embed_dim]) self.dense.build([None, None, self.embed_dim])
class TFCvtAttention(tf.keras.layers.Layer): class TFCvtAttention(keras.layers.Layer):
"""Attention layer. First chunk of the convolutional transformer block.""" """Attention layer. First chunk of the convolutional transformer block."""
def __init__( def __init__(
...@@ -507,12 +508,12 @@ class TFCvtAttention(tf.keras.layers.Layer): ...@@ -507,12 +508,12 @@ class TFCvtAttention(tf.keras.layers.Layer):
self.dense_output.build(None) self.dense_output.build(None)
class TFCvtIntermediate(tf.keras.layers.Layer): class TFCvtIntermediate(keras.layers.Layer):
"""Intermediate dense layer. Second chunk of the convolutional transformer block.""" """Intermediate dense layer. Second chunk of the convolutional transformer block."""
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs): def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=int(embed_dim * mlp_ratio), units=int(embed_dim * mlp_ratio),
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="gelu", activation="gelu",
...@@ -533,17 +534,17 @@ class TFCvtIntermediate(tf.keras.layers.Layer): ...@@ -533,17 +534,17 @@ class TFCvtIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.embed_dim]) self.dense.build([None, None, self.embed_dim])
class TFCvtOutput(tf.keras.layers.Layer): class TFCvtOutput(keras.layers.Layer):
""" """
Output of the Convolutional Transformer Block (last chunk). It consists of an MLP and a residual connection. Output of the Convolutional Transformer Block (last chunk). It consists of an MLP and a residual connection.
""" """
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs): def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.dropout = tf.keras.layers.Dropout(drop_rate) self.dropout = keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.mlp_ratio = mlp_ratio self.mlp_ratio = mlp_ratio
...@@ -562,7 +563,7 @@ class TFCvtOutput(tf.keras.layers.Layer): ...@@ -562,7 +563,7 @@ class TFCvtOutput(tf.keras.layers.Layer):
self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)]) self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])
class TFCvtLayer(tf.keras.layers.Layer): class TFCvtLayer(keras.layers.Layer):
""" """
Convolutional Transformer Block composed of attention layers, normalization and multi-layer perceptrons (mlps). It Convolutional Transformer Block composed of attention layers, normalization and multi-layer perceptrons (mlps). It
consists of 3 chunks: an attention layer, an intermediate dense layer and an output layer. This corresponds to the consists of 3 chunks: an attention layer, an intermediate dense layer and an output layer. This corresponds to the
...@@ -611,11 +612,11 @@ class TFCvtLayer(tf.keras.layers.Layer): ...@@ -611,11 +612,11 @@ class TFCvtLayer(tf.keras.layers.Layer):
self.drop_path = ( self.drop_path = (
TFCvtDropPath(drop_path_rate, name="drop_path") TFCvtDropPath(drop_path_rate, name="drop_path")
if drop_path_rate > 0.0 if drop_path_rate > 0.0
else tf.keras.layers.Activation("linear", name="drop_path") else keras.layers.Activation("linear", name="drop_path")
) )
# Using the same default epsilon as PyTorch # Using the same default epsilon as PyTorch
self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before") self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after") self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
self.embed_dim = embed_dim self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
...@@ -659,7 +660,7 @@ class TFCvtLayer(tf.keras.layers.Layer): ...@@ -659,7 +660,7 @@ class TFCvtLayer(tf.keras.layers.Layer):
self.layernorm_after.build([None, None, self.embed_dim]) self.layernorm_after.build([None, None, self.embed_dim])
class TFCvtStage(tf.keras.layers.Layer): class TFCvtStage(keras.layers.Layer):
""" """
Cvt stage (encoder block). Each stage has 2 parts: Cvt stage (encoder block). Each stage has 2 parts:
- (1) A Convolutional Token Embedding layer - (1) A Convolutional Token Embedding layer
...@@ -755,7 +756,7 @@ class TFCvtStage(tf.keras.layers.Layer): ...@@ -755,7 +756,7 @@ class TFCvtStage(tf.keras.layers.Layer):
layer.build(None) layer.build(None)
class TFCvtEncoder(tf.keras.layers.Layer): class TFCvtEncoder(keras.layers.Layer):
""" """
Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
(depth) being 1, 2 and 10. (depth) being 1, 2 and 10.
...@@ -782,7 +783,7 @@ class TFCvtEncoder(tf.keras.layers.Layer): ...@@ -782,7 +783,7 @@ class TFCvtEncoder(tf.keras.layers.Layer):
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values hidden_state = pixel_values
# When running on CPU, `tf.keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width) # When running on CPU, `keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width)
# as input format. So change the input format to (batch_size, height, width, num_channels). # as input format. So change the input format to (batch_size, height, width, num_channels).
hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1)) hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))
...@@ -817,7 +818,7 @@ class TFCvtEncoder(tf.keras.layers.Layer): ...@@ -817,7 +818,7 @@ class TFCvtEncoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFCvtMainLayer(tf.keras.layers.Layer): class TFCvtMainLayer(keras.layers.Layer):
"""Construct the Cvt model.""" """Construct the Cvt model."""
config_class = CvtConfig config_class = CvtConfig
...@@ -882,7 +883,7 @@ TFCVT_START_DOCSTRING = r""" ...@@ -882,7 +883,7 @@ TFCVT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -893,7 +894,7 @@ TFCVT_START_DOCSTRING = r""" ...@@ -893,7 +894,7 @@ TFCVT_START_DOCSTRING = r"""
- having all inputs as keyword arguments (like PyTorch models), or - having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments. - having all inputs as a list, tuple or dict in the first positional arguments.
This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all the This second option is useful when using the [`keras.Model.fit`] method, which currently requires having all the
tensors in the first argument of the model call function: `model(inputs)`. tensors in the first argument of the model call function: `model(inputs)`.
</Tip> </Tip>
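An illustration of the two calling conventions the tip describes (`model` and `pixel_values` are placeholders):

outputs = model(pixel_values=pixel_values)         # keyword arguments
outputs = model({"pixel_values": pixel_values})    # one dict in the first argument, fit()-friendly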
...@@ -1006,10 +1007,10 @@ class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassification ...@@ -1006,10 +1007,10 @@ class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassification
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.cvt = TFCvtMainLayer(config, name="cvt") self.cvt = TFCvtMainLayer(config, name="cvt")
# Using same default epsilon as in the original implementation. # Using same default epsilon as in the original implementation.
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm") self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")
# Classifier head # Classifier head
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
use_bias=True, use_bias=True,
......
...@@ -37,6 +37,7 @@ from ...modeling_tf_utils import ( ...@@ -37,6 +37,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
get_initializer, get_initializer,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -101,7 +102,7 @@ class TFData2VecVisionModelOutputWithPooling(TFBaseModelOutputWithPooling): ...@@ -101,7 +102,7 @@ class TFData2VecVisionModelOutputWithPooling(TFBaseModelOutputWithPooling):
attentions: Tuple[tf.Tensor] | None = None attentions: Tuple[tf.Tensor] | None = None
class TFData2VecVisionDropPath(tf.keras.layers.Layer): class TFData2VecVisionDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References: References:
(1) github.com:rwightman/pytorch-image-models (1) github.com:rwightman/pytorch-image-models
...@@ -121,7 +122,7 @@ class TFData2VecVisionDropPath(tf.keras.layers.Layer): ...@@ -121,7 +122,7 @@ class TFData2VecVisionDropPath(tf.keras.layers.Layer):
return x return x
class TFData2VecVisionEmbeddings(tf.keras.layers.Layer): class TFData2VecVisionEmbeddings(keras.layers.Layer):
""" """
Construct the CLS token, position and patch embeddings. Optionally, also the mask token. Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
...@@ -135,7 +136,7 @@ class TFData2VecVisionEmbeddings(tf.keras.layers.Layer): ...@@ -135,7 +136,7 @@ class TFData2VecVisionEmbeddings(tf.keras.layers.Layer):
self.num_patches = self.patch_embeddings.num_patches self.num_patches = self.patch_embeddings.num_patches
self.config = config self.config = config
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape=None): def build(self, input_shape=None):
self.cls_token = self.add_weight( self.cls_token = self.add_weight(
...@@ -193,7 +194,7 @@ class TFData2VecVisionEmbeddings(tf.keras.layers.Layer): ...@@ -193,7 +194,7 @@ class TFData2VecVisionEmbeddings(tf.keras.layers.Layer):
return embeddings return embeddings
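How the `cls_token` weight built above is typically prepended to the patch sequence, as a sketch (the exact forward pass is elided by the hunk):

def prepend_cls(self, embeddings):
    batch_size = tf.shape(embeddings)[0]
    cls_tokens = tf.tile(self.cls_token, (batch_size, 1, 1))   # (B, 1, hidden_size)
    return tf.concat([cls_tokens, embeddings], axis=1)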
class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer): class TFData2VecVisionPatchEmbeddings(keras.layers.Layer):
""" """
Image to Patch Embedding. Image to Patch Embedding.
""" """
...@@ -215,7 +216,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer): ...@@ -215,7 +216,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer):
self.patch_shape = patch_shape self.patch_shape = patch_shape
self.num_channels = num_channels self.num_channels = num_channels
self.projection = tf.keras.layers.Conv2D( self.projection = keras.layers.Conv2D(
filters=hidden_size, filters=hidden_size,
kernel_size=patch_size, kernel_size=patch_size,
strides=patch_size, strides=patch_size,
...@@ -240,7 +241,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer): ...@@ -240,7 +241,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer):
f" ({self.image_size[0]}*{self.image_size[1]})." f" ({self.image_size[0]}*{self.image_size[1]})."
) )
# When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
# So change the input format from `NCHW` to `NHWC`. # So change the input format from `NCHW` to `NHWC`.
# shape = (batch_size, in_height, in_width, in_channels=num_channels) # shape = (batch_size, in_height, in_width, in_channels=num_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
...@@ -262,7 +263,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer): ...@@ -262,7 +263,7 @@ class TFData2VecVisionPatchEmbeddings(tf.keras.layers.Layer):
self.projection.build([None, None, None, self.num_channels]) self.projection.build([None, None, None, self.num_channels])
class TFData2VecVisionSelfAttention(tf.keras.layers.Layer): class TFData2VecVisionSelfAttention(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -277,19 +278,19 @@ class TFData2VecVisionSelfAttention(tf.keras.layers.Layer): ...@@ -277,19 +278,19 @@ class TFData2VecVisionSelfAttention(tf.keras.layers.Layer):
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size) self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = tf.keras.layers.Dense( self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
) )
self.key = tf.keras.layers.Dense( self.key = keras.layers.Dense(
units=self.all_head_size, units=self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="key", name="key",
use_bias=False, use_bias=False,
) )
self.value = tf.keras.layers.Dense( self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
) )
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
if window_size: if window_size:
self.relative_position_bias = TFData2VecVisionRelativePositionBias( self.relative_position_bias = TFData2VecVisionRelativePositionBias(
...@@ -376,7 +377,7 @@ class TFData2VecVisionSelfAttention(tf.keras.layers.Layer): ...@@ -376,7 +377,7 @@ class TFData2VecVisionSelfAttention(tf.keras.layers.Layer):
self.relative_position_bias.build(None) self.relative_position_bias.build(None)
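For context on the `query`/`key`/`value` projections and `sqrt_att_head_size` above: the attention logits are divided by the square root of the head size before the softmax. A minimal sketch of that core computation (attention dropout and the relative position bias are omitted here):

```python
import math

import tensorflow as tf

def scaled_dot_product_attention(q: tf.Tensor, k: tf.Tensor, v: tf.Tensor) -> tf.Tensor:
    # Divide by sqrt(head_size) -- the `sqrt_att_head_size` factor above.
    scores = tf.matmul(q, k, transpose_b=True) / math.sqrt(q.shape[-1])
    probs = tf.nn.softmax(scores, axis=-1)  # attention dropout would go here
    return tf.matmul(probs, v)
```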
class TFData2VecVisionSelfOutput(tf.keras.layers.Layer): class TFData2VecVisionSelfOutput(keras.layers.Layer):
""" """
The residual connection is defined in TFData2VecVisionLayer instead of here (as is the case with other models), due The residual connection is defined in TFData2VecVisionLayer instead of here (as is the case with other models), due
to the layernorm applied before each block. to the layernorm applied before each block.
...@@ -385,10 +386,10 @@ class TFData2VecVisionSelfOutput(tf.keras.layers.Layer): ...@@ -385,10 +386,10 @@ class TFData2VecVisionSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs): def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor:
...@@ -406,7 +407,7 @@ class TFData2VecVisionSelfOutput(tf.keras.layers.Layer): ...@@ -406,7 +407,7 @@ class TFData2VecVisionSelfOutput(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size]) self.dense.build([None, None, self.config.hidden_size])
class TFData2VecVisionAttention(tf.keras.layers.Layer): class TFData2VecVisionAttention(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -451,11 +452,11 @@ class TFData2VecVisionAttention(tf.keras.layers.Layer): ...@@ -451,11 +452,11 @@ class TFData2VecVisionAttention(tf.keras.layers.Layer):
# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision
class TFData2VecVisionIntermediate(tf.keras.layers.Layer): class TFData2VecVisionIntermediate(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs): def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -480,14 +481,14 @@ class TFData2VecVisionIntermediate(tf.keras.layers.Layer): ...@@ -480,14 +481,14 @@ class TFData2VecVisionIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size]) self.dense.build([None, None, self.config.hidden_size])
class TFData2VecVisionOutput(tf.keras.layers.Layer): class TFData2VecVisionOutput(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs): def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
...@@ -505,7 +506,7 @@ class TFData2VecVisionOutput(tf.keras.layers.Layer): ...@@ -505,7 +506,7 @@ class TFData2VecVisionOutput(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.intermediate_size]) self.dense.build([None, None, self.config.intermediate_size])
class TFData2VecVisionLayer(tf.keras.layers.Layer): class TFData2VecVisionLayer(keras.layers.Layer):
"""This corresponds to the Block class in the timm implementation.""" """This corresponds to the Block class in the timm implementation."""
def __init__( def __init__(
...@@ -518,18 +519,14 @@ class TFData2VecVisionLayer(tf.keras.layers.Layer): ...@@ -518,18 +519,14 @@ class TFData2VecVisionLayer(tf.keras.layers.Layer):
self.intermediate = TFData2VecVisionIntermediate(config, name="intermediate") self.intermediate = TFData2VecVisionIntermediate(config, name="intermediate")
self.data2vec_output = TFData2VecVisionOutput(config, name="output") self.data2vec_output = TFData2VecVisionOutput(config, name="output")
self.layernorm_before = tf.keras.layers.LayerNormalization( self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
epsilon=config.layer_norm_eps, name="layernorm_before" self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
)
self.layernorm_after = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="layernorm_after"
)
# Using `layers.Activation` instead of `tf.identity` to better control `training` # Using `layers.Activation` instead of `tf.identity` to better control `training`
# behaviour. # behaviour.
self.drop_path = ( self.drop_path = (
TFData2VecVisionDropPath(drop_path_rate, name="drop_path") TFData2VecVisionDropPath(drop_path_rate, name="drop_path")
if drop_path_rate > 0.0 if drop_path_rate > 0.0
else tf.keras.layers.Activation("linear", name="drop_path") else keras.layers.Activation("linear", name="drop_path")
) )
self.init_values = config.layer_scale_init_value self.init_values = config.layer_scale_init_value
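`TFData2VecVisionDropPath` implements stochastic depth, and the `keras.layers.Activation("linear")` branch is just an identity stand-in when the drop rate is zero, so `training` is threaded through uniformly either way. A hedged sketch of the drop-path idea itself:

```python
import tensorflow as tf

# Stochastic depth ("drop path") sketch: during training, each sample's
# residual branch is zeroed with probability `drop_prob`, and survivors are
# rescaled so the expected output matches inference behaviour.
def drop_path(x: tf.Tensor, drop_prob: float, training: bool = False) -> tf.Tensor:
    if not training or drop_prob == 0.0:
        return x
    keep_prob = 1.0 - drop_prob
    shape = (tf.shape(x)[0],) + (1,) * (len(x.shape) - 1)  # one mask per sample
    mask = tf.floor(keep_prob + tf.random.uniform(shape, 0.0, 1.0))
    return (x / keep_prob) * mask
```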
...@@ -619,7 +616,7 @@ class TFData2VecVisionLayer(tf.keras.layers.Layer): ...@@ -619,7 +616,7 @@ class TFData2VecVisionLayer(tf.keras.layers.Layer):
# Taken and modified from here: # Taken and modified from here:
# https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/beit/beit.py#L28 # https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/beit/beit.py#L28
class TFData2VecVisionRelativePositionBias(tf.keras.layers.Layer): class TFData2VecVisionRelativePositionBias(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: tuple, **kwargs) -> None: def __init__(self, config: Data2VecVisionConfig, window_size: tuple, **kwargs) -> None:
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -675,7 +672,7 @@ class TFData2VecVisionRelativePositionBias(tf.keras.layers.Layer): ...@@ -675,7 +672,7 @@ class TFData2VecVisionRelativePositionBias(tf.keras.layers.Layer):
return tf.transpose(relative_position_bias, [2, 0, 1]) return tf.transpose(relative_position_bias, [2, 0, 1])
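The closing transpose is purely an axis reorder: the bias table is gathered as `(num_query, num_key, num_heads)` and the head axis is moved to the front so it can broadcast against attention scores of shape `(batch, heads, query, key)`. With assumed ViT-Base-like sizes (196 patches plus one CLS token, 12 heads):

```python
import tensorflow as tf

relative_position_bias = tf.zeros((197, 197, 12))  # (query, key, heads) -- assumed sizes
relative_position_bias = tf.transpose(relative_position_bias, [2, 0, 1])
print(relative_position_bias.shape)  # (12, 197, 197)
```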
class TFData2VecVisionEncoder(tf.keras.layers.Layer): class TFData2VecVisionEncoder(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -753,7 +750,7 @@ class TFData2VecVisionEncoder(tf.keras.layers.Layer): ...@@ -753,7 +750,7 @@ class TFData2VecVisionEncoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFData2VecVisionMainLayer(tf.keras.layers.Layer): class TFData2VecVisionMainLayer(keras.layers.Layer):
config_class = Data2VecVisionConfig config_class = Data2VecVisionConfig
def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = True, **kwargs): def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = True, **kwargs):
...@@ -769,14 +766,14 @@ class TFData2VecVisionMainLayer(tf.keras.layers.Layer): ...@@ -769,14 +766,14 @@ class TFData2VecVisionMainLayer(tf.keras.layers.Layer):
self.layernorm = ( self.layernorm = (
tf.identity tf.identity
if config.use_mean_pooling if config.use_mean_pooling
else tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") else keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
) )
# We are setting the `data_format` like so because from here on we will revert to the # We are setting the `data_format` like so because from here on we will revert to the
# NCHW output format # NCHW output format
self.pooler = TFData2VecVisionPooler(config, name="pooler") if add_pooling_layer else None self.pooler = TFData2VecVisionPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings.patch_embeddings return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
...@@ -861,11 +858,11 @@ class TFData2VecVisionMainLayer(tf.keras.layers.Layer): ...@@ -861,11 +858,11 @@ class TFData2VecVisionMainLayer(tf.keras.layers.Layer):
self.pooler.build(None) self.pooler.build(None)
class TFData2VecVisionPooler(tf.keras.layers.Layer): class TFData2VecVisionPooler(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs): def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.layernorm = ( self.layernorm = (
tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
if config.use_mean_pooling if config.use_mean_pooling
else None else None
) )
...@@ -909,7 +906,7 @@ DATA2VEC_VISION_START_DOCSTRING = r""" ...@@ -909,7 +906,7 @@ DATA2VEC_VISION_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.). etc.).
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -1049,7 +1046,7 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF ...@@ -1049,7 +1046,7 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF
self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=True, name="data2vec_vision") self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=True, name="data2vec_vision")
# Classifier head # Classifier head
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
...@@ -1118,7 +1115,7 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF ...@@ -1118,7 +1115,7 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF
self.classifier.build([None, None, self.config.hidden_size]) self.classifier.build([None, None, self.config.hidden_size])
class TFData2VecVisionConvModule(tf.keras.layers.Layer): class TFData2VecVisionConvModule(keras.layers.Layer):
""" """
A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
...@@ -1137,7 +1134,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer): ...@@ -1137,7 +1134,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer):
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__(**kwargs) super().__init__(**kwargs)
self.conv = tf.keras.layers.Conv2D( self.conv = keras.layers.Conv2D(
filters=out_channels, filters=out_channels,
kernel_size=kernel_size, kernel_size=kernel_size,
padding=padding, padding=padding,
...@@ -1145,7 +1142,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer): ...@@ -1145,7 +1142,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer):
dilation_rate=dilation, dilation_rate=dilation,
name="conv", name="conv",
) )
self.bn = tf.keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5) self.bn = keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5)
self.activation = tf.nn.relu self.activation = tf.nn.relu
self.in_channels = in_channels self.in_channels = in_channels
self.out_channels = out_channels self.out_channels = out_channels
...@@ -1168,7 +1165,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer): ...@@ -1168,7 +1165,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer):
self.bn.build((None, None, None, self.out_channels)) self.bn.build((None, None, None, self.out_channels))
class TFAdaptiveAvgPool2D(tf.keras.layers.Layer): class TFAdaptiveAvgPool2D(keras.layers.Layer):
def __init__(self, output_dims: Tuple[int, int], input_ordering: str = "NHWC", **kwargs): def __init__(self, output_dims: Tuple[int, int], input_ordering: str = "NHWC", **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.output_dims = output_dims self.output_dims = output_dims
...@@ -1292,7 +1289,7 @@ class TFAdaptiveAvgPool2D(tf.keras.layers.Layer): ...@@ -1292,7 +1289,7 @@ class TFAdaptiveAvgPool2D(tf.keras.layers.Layer):
return self.pseudo_1d_pool(h_pooled, h_pooling=False) return self.pseudo_1d_pool(h_pooled, h_pooling=False)
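TensorFlow has no built-in equivalent of PyTorch's `nn.AdaptiveAvgPool2d`, so `TFAdaptiveAvgPool2D` reproduces it by pooling height and width as two pseudo-1D passes (`pseudo_1d_pool` above). When the target size divides the input evenly, adaptive average pooling reduces to plain mean pooling over equal bins, which makes a quick sanity check possible:

```python
import tensorflow as tf

# 4x4 -> 2x2 in NHWC layout: adaptive pooling here is just mean pooling
# over non-overlapping 2x2 windows.
x = tf.reshape(tf.range(16, dtype=tf.float32), (1, 4, 4, 1))
pooled = tf.nn.avg_pool2d(x, ksize=2, strides=2, padding="VALID")
print(tf.squeeze(pooled).numpy())  # [[ 2.5  4.5]
                                   #  [10.5 12.5]]
```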
class TFData2VecVisionPyramidPoolingModule(tf.keras.layers.Layer): class TFData2VecVisionPyramidPoolingModule(keras.layers.Layer):
""" """
Pyramid Pooling Module (PPM) used in PSPNet. Pyramid Pooling Module (PPM) used in PSPNet.
...@@ -1342,7 +1339,7 @@ class TFData2VecVisionPyramidPoolingModule(tf.keras.layers.Layer): ...@@ -1342,7 +1339,7 @@ class TFData2VecVisionPyramidPoolingModule(tf.keras.layers.Layer):
layer_module.build(None) layer_module.build(None)
class TFData2VecVisionUperHead(tf.keras.layers.Layer): class TFData2VecVisionUperHead(keras.layers.Layer):
""" """
Unified Perceptual Parsing for Scene Understanding. This head is the implementation of Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
[UPerNet](https://arxiv.org/abs/1807.10221). [UPerNet](https://arxiv.org/abs/1807.10221).
...@@ -1356,7 +1353,7 @@ class TFData2VecVisionUperHead(tf.keras.layers.Layer): ...@@ -1356,7 +1353,7 @@ class TFData2VecVisionUperHead(tf.keras.layers.Layer):
self.pool_scales = config.pool_scales # e.g. (1, 2, 3, 6) self.pool_scales = config.pool_scales # e.g. (1, 2, 3, 6)
self.in_channels = [config.hidden_size] * 4 # e.g. [768, 768, 768, 768] self.in_channels = [config.hidden_size] * 4 # e.g. [768, 768, 768, 768]
self.channels = config.hidden_size self.channels = config.hidden_size
self.classifier = tf.keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier") self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
# PSP Module # PSP Module
self.psp_modules = TFData2VecVisionPyramidPoolingModule( self.psp_modules = TFData2VecVisionPyramidPoolingModule(
...@@ -1452,7 +1449,7 @@ class TFData2VecVisionUperHead(tf.keras.layers.Layer): ...@@ -1452,7 +1449,7 @@ class TFData2VecVisionUperHead(tf.keras.layers.Layer):
layer.build(None) layer.build(None)
class TFData2VecVisionFCNHead(tf.keras.layers.Layer): class TFData2VecVisionFCNHead(keras.layers.Layer):
""" """
Fully Convolutional Networks for Semantic Segmentation. This head is based on Fully Convolutional Networks for Semantic Segmentation. This head is based on
[FCNNet](https://arxiv.org/abs/1411.4038). [FCNNet](https://arxiv.org/abs/1411.4038).
...@@ -1516,7 +1513,7 @@ class TFData2VecVisionFCNHead(tf.keras.layers.Layer): ...@@ -1516,7 +1513,7 @@ class TFData2VecVisionFCNHead(tf.keras.layers.Layer):
name="conv_cat", name="conv_cat",
) )
self.classifier = tf.keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier") self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor: def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor:
# just take the relevant feature maps # just take the relevant feature maps
...@@ -1555,15 +1552,15 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel): ...@@ -1555,15 +1552,15 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
# FPNs # FPNs
self.fpn1 = [ self.fpn1 = [
tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"), keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"),
tf.keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5), keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5),
tf.keras.layers.Activation("gelu"), keras.layers.Activation("gelu"),
tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"), keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"),
] ]
self.fpn2 = [tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0")] self.fpn2 = [keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0")]
self.fpn3 = tf.identity self.fpn3 = tf.identity
self.fpn4 = tf.keras.layers.MaxPool2D(pool_size=2, strides=2) self.fpn4 = keras.layers.MaxPool2D(pool_size=2, strides=2)
# Semantic segmentation head(s) # Semantic segmentation head(s)
self.decode_head = TFData2VecVisionUperHead(config, name="decode_head") self.decode_head = TFData2VecVisionUperHead(config, name="decode_head")
...@@ -1582,7 +1579,7 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel): ...@@ -1582,7 +1579,7 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
if auxiliary_logits is not None: if auxiliary_logits is not None:
upsampled_auxiliary_logits = tf.image.resize(auxiliary_logits, size=label_interp_shape, method="bilinear") upsampled_auxiliary_logits = tf.image.resize(auxiliary_logits, size=label_interp_shape, method="bilinear")
# compute weighted loss # compute weighted loss
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
# Copied from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics. # Copied from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics.
# Utility to mask the index to ignore during computing the loss. # Utility to mask the index to ignore during computing the loss.
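The `reduction="none"` above is what makes the masking trick work: per-pixel losses are kept, positions matching the ignore index are zeroed out, and only the remaining pixels are averaged. A hedged functional sketch (the `ignore_index=255` default is an assumption for illustration, not taken from this diff):

```python
import tensorflow as tf

def masked_segmentation_loss(labels: tf.Tensor, logits: tf.Tensor, ignore_index: int = 255) -> tf.Tensor:
    # Per-pixel losses with no reduction, as with the loss_fct defined above.
    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    mask = tf.cast(labels != ignore_index, logits.dtype)
    # Replace ignored labels with a valid class id; their loss is masked anyway.
    safe_labels = tf.where(labels == ignore_index, tf.zeros_like(labels), labels)
    per_pixel = loss_fct(safe_labels, logits) * mask
    # Average only over the pixels that are not ignored.
    return tf.reduce_sum(per_pixel) / tf.reduce_sum(mask)
```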
......
...@@ -39,6 +39,7 @@ from ...modeling_tf_utils import ( ...@@ -39,6 +39,7 @@ from ...modeling_tf_utils import (
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
TFTokenClassificationLoss, TFTokenClassificationLoss,
get_initializer, get_initializer,
keras,
unpack_inputs, unpack_inputs,
) )
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
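The `keras` name added to these imports is the point of the whole change: instead of hard-coding `tf.keras`, modeling code pulls a `keras` alias out of `modeling_tf_utils`, which can keep resolving to the Keras 2 API once Keras 3 becomes the default. A minimal sketch of such a shim, assuming the alias is resolved roughly like this (the exact logic lives in `modeling_tf_utils` and may differ in detail):

```python
# Hedged sketch of a Keras-2 compatibility shim, not the verbatim
# transformers implementation.
try:
    import tf_keras as keras  # standalone Keras 2 compatibility package
except (ModuleNotFoundError, ImportError):
    from tensorflow import keras

    if int(keras.__version__.split(".")[0]) > 2:
        raise ValueError(
            "Your installed Keras is Keras 3, but the TF modeling code needs "
            "the Keras 2 API. Please install the backwards-compatible "
            "`tf-keras` package."
        )
```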
...@@ -58,10 +59,10 @@ TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -58,10 +59,10 @@ TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
] ]
class TFDebertaContextPooler(tf.keras.layers.Layer): class TFDebertaContextPooler(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.pooler_hidden_size, name="dense") self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout") self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout")
self.config = config self.config = config
...@@ -90,7 +91,7 @@ class TFDebertaContextPooler(tf.keras.layers.Layer): ...@@ -90,7 +91,7 @@ class TFDebertaContextPooler(tf.keras.layers.Layer):
self.dropout.build(None) self.dropout.build(None)
class TFDebertaXSoftmax(tf.keras.layers.Layer): class TFDebertaXSoftmax(keras.layers.Layer):
""" """
Masked Softmax which is optimized for saving memory Masked Softmax which is optimized for saving memory
...@@ -112,7 +113,7 @@ class TFDebertaXSoftmax(tf.keras.layers.Layer): ...@@ -112,7 +113,7 @@ class TFDebertaXSoftmax(tf.keras.layers.Layer):
return output return output
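`TFDebertaXSoftmax` computes a softmax only over unmasked positions. A minimal functional sketch of the same idea, using plain `tf.nn.softmax` (the real layer builds on the `stable_softmax` imported above):

```python
import tensorflow as tf

def masked_softmax(logits: tf.Tensor, mask: tf.Tensor, axis: int = -1) -> tf.Tensor:
    # Masked positions get the dtype's minimum so they vanish in the softmax,
    # then are explicitly zeroed so no probability mass leaks back in.
    mask = tf.cast(mask, tf.bool)
    neg_inf = tf.fill(tf.shape(logits), logits.dtype.min)
    probs = tf.nn.softmax(tf.where(mask, logits, neg_inf), axis=axis)
    return tf.where(mask, probs, tf.zeros_like(probs))
```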
class TFDebertaStableDropout(tf.keras.layers.Layer): class TFDebertaStableDropout(keras.layers.Layer):
""" """
Optimized dropout module for stabilizing the training Optimized dropout module for stabilizing the training
...@@ -152,7 +153,7 @@ class TFDebertaStableDropout(tf.keras.layers.Layer): ...@@ -152,7 +153,7 @@ class TFDebertaStableDropout(tf.keras.layers.Layer):
return inputs return inputs
class TFDebertaLayerNorm(tf.keras.layers.Layer): class TFDebertaLayerNorm(keras.layers.Layer):
"""LayerNorm module in the TF style (epsilon inside the square root).""" """LayerNorm module in the TF style (epsilon inside the square root)."""
def __init__(self, size, eps=1e-12, **kwargs): def __init__(self, size, eps=1e-12, **kwargs):
...@@ -172,11 +173,11 @@ class TFDebertaLayerNorm(tf.keras.layers.Layer): ...@@ -172,11 +173,11 @@ class TFDebertaLayerNorm(tf.keras.layers.Layer):
return self.gamma * (x - mean) / std + self.beta return self.gamma * (x - mean) / std + self.beta
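The docstring's "epsilon inside the square root" is the entire difference from the more common formulation: the variance is smoothed before the root is taken, rather than epsilon being added to the standard deviation afterwards. A compact functional equivalent of the layer above:

```python
import tensorflow as tf

def tf_style_layer_norm(x: tf.Tensor, gamma: tf.Tensor, beta: tf.Tensor, eps: float = 1e-12) -> tf.Tensor:
    mean = tf.reduce_mean(x, axis=-1, keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True)
    std = tf.sqrt(variance + eps)  # epsilon added *inside* the square root
    return gamma * (x - mean) / std + beta
```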
class TFDebertaSelfOutput(tf.keras.layers.Layer): class TFDebertaSelfOutput(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") self.dense = keras.layers.Dense(config.hidden_size, name="dense")
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config self.config = config
...@@ -201,7 +202,7 @@ class TFDebertaSelfOutput(tf.keras.layers.Layer): ...@@ -201,7 +202,7 @@ class TFDebertaSelfOutput(tf.keras.layers.Layer):
self.dropout.build(None) self.dropout.build(None)
class TFDebertaAttention(tf.keras.layers.Layer): class TFDebertaAttention(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.self = TFDebertaDisentangledSelfAttention(config, name="self") self.self = TFDebertaDisentangledSelfAttention(config, name="self")
...@@ -249,11 +250,11 @@ class TFDebertaAttention(tf.keras.layers.Layer): ...@@ -249,11 +250,11 @@ class TFDebertaAttention(tf.keras.layers.Layer):
self.dense_output.build(None) self.dense_output.build(None)
class TFDebertaIntermediate(tf.keras.layers.Layer): class TFDebertaIntermediate(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -278,14 +279,14 @@ class TFDebertaIntermediate(tf.keras.layers.Layer): ...@@ -278,14 +279,14 @@ class TFDebertaIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size]) self.dense.build([None, None, self.config.hidden_size])
class TFDebertaOutput(tf.keras.layers.Layer): class TFDebertaOutput(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config self.config = config
...@@ -311,7 +312,7 @@ class TFDebertaOutput(tf.keras.layers.Layer): ...@@ -311,7 +312,7 @@ class TFDebertaOutput(tf.keras.layers.Layer):
self.dropout.build(None) self.dropout.build(None)
class TFDebertaLayer(tf.keras.layers.Layer): class TFDebertaLayer(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -362,7 +363,7 @@ class TFDebertaLayer(tf.keras.layers.Layer): ...@@ -362,7 +363,7 @@ class TFDebertaLayer(tf.keras.layers.Layer):
self.bert_output.build(None) self.bert_output.build(None)
class TFDebertaEncoder(tf.keras.layers.Layer): class TFDebertaEncoder(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -543,7 +544,7 @@ def torch_gather(x, indices, gather_axis): ...@@ -543,7 +544,7 @@ def torch_gather(x, indices, gather_axis):
return gathered return gathered
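`torch_gather` re-creates PyTorch's `torch.gather` in TensorFlow. For the common case of gathering along the last axis with a leading batch dimension, `tf.gather` with `batch_dims` produces the same result, as a quick illustration:

```python
import tensorflow as tf

x = tf.constant([[10.0, 20.0, 30.0],
                 [40.0, 50.0, 60.0]])
indices = tf.constant([[2, 0],
                       [1, 1]])
# Per-row lookup along the last axis, like torch.gather(x, -1, indices).
print(tf.gather(x, indices, batch_dims=1).numpy())  # [[30. 10.]
                                                    #  [50. 50.]]
```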
class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
""" """
Disentangled self-attention module Disentangled self-attention module
...@@ -564,7 +565,7 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): ...@@ -564,7 +565,7 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
self.num_attention_heads = config.num_attention_heads self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size
self.in_proj = tf.keras.layers.Dense( self.in_proj = keras.layers.Dense(
self.all_head_size * 3, self.all_head_size * 3,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="in_proj", name="in_proj",
...@@ -576,13 +577,13 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): ...@@ -576,13 +577,13 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
self.talking_head = getattr(config, "talking_head", False) self.talking_head = getattr(config, "talking_head", False)
if self.talking_head: if self.talking_head:
self.head_logits_proj = tf.keras.layers.Dense( self.head_logits_proj = keras.layers.Dense(
self.num_attention_heads, self.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="head_logits_proj", name="head_logits_proj",
use_bias=False, use_bias=False,
) )
self.head_weights_proj = tf.keras.layers.Dense( self.head_weights_proj = keras.layers.Dense(
self.num_attention_heads, self.num_attention_heads,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="head_weights_proj", name="head_weights_proj",
...@@ -597,14 +598,14 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): ...@@ -597,14 +598,14 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
self.max_relative_positions = config.max_position_embeddings self.max_relative_positions = config.max_position_embeddings
self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout") self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout")
if "c2p" in self.pos_att_type: if "c2p" in self.pos_att_type:
self.pos_proj = tf.keras.layers.Dense( self.pos_proj = keras.layers.Dense(
self.all_head_size, self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="pos_proj", name="pos_proj",
use_bias=False, use_bias=False,
) )
if "p2c" in self.pos_att_type: if "p2c" in self.pos_att_type:
self.pos_q_proj = tf.keras.layers.Dense( self.pos_q_proj = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj" self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj"
) )
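The `pos_proj` / `pos_q_proj` layers feed the relative-position terms of DeBERTa's disentangled attention: on top of the usual content-to-content product, a content-to-position ("c2p") and a position-to-content ("p2c") term are added, and the softmax scaling grows with the number of active terms (√(3d) when both are enabled). Schematically, following the DeBERTa paper, where δ(i, j) is the clamped relative distance and d the head size:

```latex
A_{ij} = \underbrace{Q^c_i \,(K^c_j)^{\top}}_{\text{content-to-content}}
       + \underbrace{Q^c_i \,\big(K^r_{\delta(i,j)}\big)^{\top}}_{\text{c2p}}
       + \underbrace{K^c_j \,\big(Q^r_{\delta(j,i)}\big)^{\top}}_{\text{p2c}},
\qquad
\tilde{A}_{ij} = \frac{A_{ij}}{\sqrt{3d}}
```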
...@@ -616,10 +617,10 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): ...@@ -616,10 +617,10 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
return return
self.built = True self.built = True
self.q_bias = self.add_weight( self.q_bias = self.add_weight(
name="q_bias", shape=(self.all_head_size), initializer=tf.keras.initializers.Zeros() name="q_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
) )
self.v_bias = self.add_weight( self.v_bias = self.add_weight(
name="v_bias", shape=(self.all_head_size), initializer=tf.keras.initializers.Zeros() name="v_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
) )
if getattr(self, "in_proj", None) is not None: if getattr(self, "in_proj", None) is not None:
with tf.name_scope(self.in_proj.name): with tf.name_scope(self.in_proj.name):
...@@ -818,7 +819,7 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): ...@@ -818,7 +819,7 @@ class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer):
return score return score
class TFDebertaEmbeddings(tf.keras.layers.Layer): class TFDebertaEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.""" """Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
...@@ -831,13 +832,13 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): ...@@ -831,13 +832,13 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
self.position_biased_input = getattr(config, "position_biased_input", True) self.position_biased_input = getattr(config, "position_biased_input", True)
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
if self.embedding_size != config.hidden_size: if self.embedding_size != config.hidden_size:
self.embed_proj = tf.keras.layers.Dense( self.embed_proj = keras.layers.Dense(
config.hidden_size, config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="embed_proj", name="embed_proj",
use_bias=False, use_bias=False,
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
def build(self, input_shape=None): def build(self, input_shape=None):
...@@ -937,13 +938,13 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): ...@@ -937,13 +938,13 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer):
return final_embeddings return final_embeddings
class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer): class TFDebertaPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size) self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=self.embedding_size, units=self.embedding_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
...@@ -953,7 +954,7 @@ class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -953,7 +954,7 @@ class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = get_tf_activation(config.hidden_act) self.transform_act_fn = get_tf_activation(config.hidden_act)
else: else:
self.transform_act_fn = config.hidden_act self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
...@@ -975,8 +976,8 @@ class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -975,8 +976,8 @@ class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.embedding_size]) self.LayerNorm.build([None, None, self.embedding_size])
class TFDebertaLMPredictionHead(tf.keras.layers.Layer): class TFDebertaLMPredictionHead(keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -998,7 +999,7 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer): ...@@ -998,7 +999,7 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer):
with tf.name_scope(self.transform.name): with tf.name_scope(self.transform.name):
self.transform.build(None) self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer: def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable): def set_output_embeddings(self, value: tf.Variable):
...@@ -1023,8 +1024,8 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer): ...@@ -1023,8 +1024,8 @@ class TFDebertaLMPredictionHead(tf.keras.layers.Layer):
return hidden_states return hidden_states
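`get_output_embeddings` returning `self.input_embeddings` is weight tying: the LM head reuses the input embedding matrix as its output projection, adding only a vocabulary-sized bias. In effect (toy sizes assumed, BERT-like vocab of 30522 and hidden size 768):

```python
import tensorflow as tf

hidden_states = tf.random.normal((1, 5, 768))
embedding_matrix = tf.random.normal((30522, 768))  # shared with the input side
bias = tf.zeros((30522,))
logits = tf.matmul(hidden_states, embedding_matrix, transpose_b=True) + bias
print(logits.shape)  # (1, 5, 30522)
```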
class TFDebertaOnlyMLMHead(tf.keras.layers.Layer): class TFDebertaOnlyMLMHead(keras.layers.Layer):
def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions") self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions")
...@@ -1043,7 +1044,7 @@ class TFDebertaOnlyMLMHead(tf.keras.layers.Layer): ...@@ -1043,7 +1044,7 @@ class TFDebertaOnlyMLMHead(tf.keras.layers.Layer):
# @keras_serializable # @keras_serializable
class TFDebertaMainLayer(tf.keras.layers.Layer): class TFDebertaMainLayer(keras.layers.Layer):
config_class = DebertaConfig config_class = DebertaConfig
def __init__(self, config: DebertaConfig, **kwargs): def __init__(self, config: DebertaConfig, **kwargs):
...@@ -1054,7 +1055,7 @@ class TFDebertaMainLayer(tf.keras.layers.Layer): ...@@ -1054,7 +1055,7 @@ class TFDebertaMainLayer(tf.keras.layers.Layer):
self.embeddings = TFDebertaEmbeddings(config, name="embeddings") self.embeddings = TFDebertaEmbeddings(config, name="embeddings")
self.encoder = TFDebertaEncoder(config, name="encoder") self.encoder = TFDebertaEncoder(config, name="encoder")
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings return self.embeddings
def set_input_embeddings(self, value: tf.Variable): def set_input_embeddings(self, value: tf.Variable):
...@@ -1153,7 +1154,7 @@ DEBERTA_START_DOCSTRING = r""" ...@@ -1153,7 +1154,7 @@ DEBERTA_START_DOCSTRING = r"""
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data. improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -1299,7 +1300,7 @@ class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -1299,7 +1300,7 @@ class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLos
self.deberta = TFDebertaMainLayer(config, name="deberta") self.deberta = TFDebertaMainLayer(config, name="deberta")
self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls") self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
@unpack_inputs @unpack_inputs
...@@ -1385,7 +1386,7 @@ class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceCla ...@@ -1385,7 +1386,7 @@ class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceCla
drop_out = getattr(config, "cls_dropout", None) drop_out = getattr(config, "cls_dropout", None)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = TFDebertaStableDropout(drop_out, name="cls_dropout") self.dropout = TFDebertaStableDropout(drop_out, name="cls_dropout")
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
...@@ -1479,8 +1480,8 @@ class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassific ...@@ -1479,8 +1480,8 @@ class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassific
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.deberta = TFDebertaMainLayer(config, name="deberta") self.deberta = TFDebertaMainLayer(config, name="deberta")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
self.config = config self.config = config
...@@ -1562,7 +1563,7 @@ class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnswerin ...@@ -1562,7 +1563,7 @@ class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnswerin
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.deberta = TFDebertaMainLayer(config, name="deberta") self.deberta = TFDebertaMainLayer(config, name="deberta")
self.qa_outputs = tf.keras.layers.Dense( self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
) )
self.config = config self.config = config
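For extractive QA, `config.num_labels` is 2, so `qa_outputs` emits one start and one end logit per token, which are split off after the Dense projection. A hedged usage sketch:

```python
import tensorflow as tf

sequence_output = tf.random.normal((1, 9, 768))      # (batch, seq_len, hidden)
qa_outputs = tf.keras.layers.Dense(units=2)
logits = qa_outputs(sequence_output)                 # (1, 9, 2)
start_logits, end_logits = tf.split(logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)     # (1, 9)
end_logits = tf.squeeze(end_logits, axis=-1)         # (1, 9)
```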
......
...@@ -39,6 +39,7 @@ from ...modeling_tf_utils import ( ...@@ -39,6 +39,7 @@ from ...modeling_tf_utils import (
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
TFTokenClassificationLoss, TFTokenClassificationLoss,
get_initializer, get_initializer,
keras,
unpack_inputs, unpack_inputs,
) )
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
...@@ -58,10 +59,10 @@ TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -58,10 +59,10 @@ TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaContextPooler with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaContextPooler with Deberta->DebertaV2
class TFDebertaV2ContextPooler(tf.keras.layers.Layer): class TFDebertaV2ContextPooler(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.pooler_hidden_size, name="dense") self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
self.dropout = TFDebertaV2StableDropout(config.pooler_dropout, name="dropout") self.dropout = TFDebertaV2StableDropout(config.pooler_dropout, name="dropout")
self.config = config self.config = config
...@@ -91,7 +92,7 @@ class TFDebertaV2ContextPooler(tf.keras.layers.Layer): ...@@ -91,7 +92,7 @@ class TFDebertaV2ContextPooler(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaXSoftmax with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaXSoftmax with Deberta->DebertaV2
class TFDebertaV2XSoftmax(tf.keras.layers.Layer): class TFDebertaV2XSoftmax(keras.layers.Layer):
""" """
Masked Softmax which is optimized for saving memory Masked Softmax which is optimized for saving memory
...@@ -114,7 +115,7 @@ class TFDebertaV2XSoftmax(tf.keras.layers.Layer): ...@@ -114,7 +115,7 @@ class TFDebertaV2XSoftmax(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaStableDropout with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaStableDropout with Deberta->DebertaV2
class TFDebertaV2StableDropout(tf.keras.layers.Layer): class TFDebertaV2StableDropout(keras.layers.Layer):
""" """
Optimized dropout module for stabilizing the training Optimized dropout module for stabilizing the training
...@@ -155,11 +156,11 @@ class TFDebertaV2StableDropout(tf.keras.layers.Layer): ...@@ -155,11 +156,11 @@ class TFDebertaV2StableDropout(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaSelfOutput with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaSelfOutput with Deberta->DebertaV2
class TFDebertaV2SelfOutput(tf.keras.layers.Layer): class TFDebertaV2SelfOutput(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") self.dense = keras.layers.Dense(config.hidden_size, name="dense")
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config self.config = config
...@@ -185,7 +186,7 @@ class TFDebertaV2SelfOutput(tf.keras.layers.Layer): ...@@ -185,7 +186,7 @@ class TFDebertaV2SelfOutput(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaAttention with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaAttention with Deberta->DebertaV2
class TFDebertaV2Attention(tf.keras.layers.Layer): class TFDebertaV2Attention(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.self = TFDebertaV2DisentangledSelfAttention(config, name="self") self.self = TFDebertaV2DisentangledSelfAttention(config, name="self")
...@@ -234,11 +235,11 @@ class TFDebertaV2Attention(tf.keras.layers.Layer): ...@@ -234,11 +235,11 @@ class TFDebertaV2Attention(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaIntermediate with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaIntermediate with Deberta->DebertaV2
class TFDebertaV2Intermediate(tf.keras.layers.Layer): class TFDebertaV2Intermediate(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -264,14 +265,14 @@ class TFDebertaV2Intermediate(tf.keras.layers.Layer): ...@@ -264,14 +265,14 @@ class TFDebertaV2Intermediate(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOutput with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOutput with Deberta->DebertaV2
class TFDebertaV2Output(tf.keras.layers.Layer): class TFDebertaV2Output(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config self.config = config
...@@ -298,7 +299,7 @@ class TFDebertaV2Output(tf.keras.layers.Layer): ...@@ -298,7 +299,7 @@ class TFDebertaV2Output(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLayer with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLayer with Deberta->DebertaV2
class TFDebertaV2Layer(tf.keras.layers.Layer): class TFDebertaV2Layer(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -349,7 +350,7 @@ class TFDebertaV2Layer(tf.keras.layers.Layer): ...@@ -349,7 +350,7 @@ class TFDebertaV2Layer(tf.keras.layers.Layer):
self.bert_output.build(None) self.bert_output.build(None)
class TFDebertaV2ConvLayer(tf.keras.layers.Layer): class TFDebertaV2ConvLayer(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -357,7 +358,7 @@ class TFDebertaV2ConvLayer(tf.keras.layers.Layer): ...@@ -357,7 +358,7 @@ class TFDebertaV2ConvLayer(tf.keras.layers.Layer):
# groups = getattr(config, "conv_groups", 1) # groups = getattr(config, "conv_groups", 1)
self.conv_act = get_tf_activation(getattr(config, "conv_act", "tanh")) self.conv_act = get_tf_activation(getattr(config, "conv_act", "tanh"))
self.padding = (self.kernel_size - 1) // 2 self.padding = (self.kernel_size - 1) // 2
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
self.config = config self.config = config
...@@ -412,7 +413,7 @@ class TFDebertaV2ConvLayer(tf.keras.layers.Layer): ...@@ -412,7 +413,7 @@ class TFDebertaV2ConvLayer(tf.keras.layers.Layer):
return output_states return output_states
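`self.padding = (self.kernel_size - 1) // 2` is the usual "same-length" padding for an odd kernel. A quick check that the sequence length survives the convolution:

```python
import tensorflow as tf

kernel_size = 3
pad = (kernel_size - 1) // 2                       # 1
x = tf.random.normal((1, 10, 16))                  # (batch, seq_len, channels)
x_padded = tf.pad(x, [[0, 0], [pad, pad], [0, 0]])
conv = tf.keras.layers.Conv1D(filters=16, kernel_size=kernel_size, padding="valid")
print(conv(x_padded).shape)                        # (1, 10, 16) -- length preserved
```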
class TFDebertaV2Encoder(tf.keras.layers.Layer): class TFDebertaV2Encoder(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -433,7 +434,7 @@ class TFDebertaV2Encoder(tf.keras.layers.Layer): ...@@ -433,7 +434,7 @@ class TFDebertaV2Encoder(tf.keras.layers.Layer):
self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")] self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")]
if "layer_norm" in self.norm_rel_ebd: if "layer_norm" in self.norm_rel_ebd:
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.conv = TFDebertaV2ConvLayer(config, name="conv") if getattr(config, "conv_kernel_size", 0) > 0 else None self.conv = TFDebertaV2ConvLayer(config, name="conv") if getattr(config, "conv_kernel_size", 0) > 0 else None
...@@ -634,7 +635,7 @@ def take_along_axis(x, indices): ...@@ -634,7 +635,7 @@ def take_along_axis(x, indices):
return gathered return gathered
class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer): class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
""" """
Disentangled self-attention module Disentangled self-attention module
...@@ -656,19 +657,19 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer): ...@@ -656,19 +657,19 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
_attention_head_size = config.hidden_size // config.num_attention_heads _attention_head_size = config.hidden_size // config.num_attention_heads
self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size) self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query_proj = tf.keras.layers.Dense( self.query_proj = keras.layers.Dense(
self.all_head_size, self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="query_proj", name="query_proj",
use_bias=True, use_bias=True,
) )
self.key_proj = tf.keras.layers.Dense( self.key_proj = keras.layers.Dense(
self.all_head_size, self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="key_proj", name="key_proj",
use_bias=True, use_bias=True,
) )
self.value_proj = tf.keras.layers.Dense( self.value_proj = keras.layers.Dense(
self.all_head_size, self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="value_proj", name="value_proj",
...@@ -692,14 +693,14 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer): ...@@ -692,14 +693,14 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
if not self.share_att_key: if not self.share_att_key:
if "c2p" in self.pos_att_type: if "c2p" in self.pos_att_type:
self.pos_key_proj = tf.keras.layers.Dense( self.pos_key_proj = keras.layers.Dense(
self.all_head_size, self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="pos_proj", name="pos_proj",
use_bias=True, use_bias=True,
) )
if "p2c" in self.pos_att_type: if "p2c" in self.pos_att_type:
self.pos_query_proj = tf.keras.layers.Dense( self.pos_query_proj = keras.layers.Dense(
self.all_head_size, self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="pos_q_proj", name="pos_q_proj",
...@@ -925,7 +926,7 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer): ...@@ -925,7 +926,7 @@ class TFDebertaV2DisentangledSelfAttention(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaEmbeddings Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaEmbeddings Deberta->DebertaV2
class TFDebertaV2Embeddings(tf.keras.layers.Layer): class TFDebertaV2Embeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.""" """Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
...@@ -938,13 +939,13 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): ...@@ -938,13 +939,13 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
self.position_biased_input = getattr(config, "position_biased_input", True) self.position_biased_input = getattr(config, "position_biased_input", True)
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
if self.embedding_size != config.hidden_size: if self.embedding_size != config.hidden_size:
self.embed_proj = tf.keras.layers.Dense( self.embed_proj = keras.layers.Dense(
config.hidden_size, config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="embed_proj", name="embed_proj",
use_bias=False, use_bias=False,
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
def build(self, input_shape=None): def build(self, input_shape=None):
...@@ -1045,13 +1046,13 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): ...@@ -1045,13 +1046,13 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaPredictionHeadTransform with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaPredictionHeadTransform with Deberta->DebertaV2
class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer): class TFDebertaV2PredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size) self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=self.embedding_size, units=self.embedding_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
...@@ -1061,7 +1062,7 @@ class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer): ...@@ -1061,7 +1062,7 @@ class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = get_tf_activation(config.hidden_act) self.transform_act_fn = get_tf_activation(config.hidden_act)
else: else:
self.transform_act_fn = config.hidden_act self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
...@@ -1084,8 +1085,8 @@ class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer): ...@@ -1084,8 +1085,8 @@ class TFDebertaV2PredictionHeadTransform(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLMPredictionHead with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLMPredictionHead with Deberta->DebertaV2
class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): class TFDebertaV2LMPredictionHead(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -1107,7 +1108,7 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): ...@@ -1107,7 +1108,7 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer):
with tf.name_scope(self.transform.name): with tf.name_scope(self.transform.name):
self.transform.build(None) self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer: def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable): def set_output_embeddings(self, value: tf.Variable):
...@@ -1133,8 +1134,8 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): ...@@ -1133,8 +1134,8 @@ class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOnlyMLMHead with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOnlyMLMHead with Deberta->DebertaV2
class TFDebertaV2OnlyMLMHead(tf.keras.layers.Layer): class TFDebertaV2OnlyMLMHead(keras.layers.Layer):
def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.predictions = TFDebertaV2LMPredictionHead(config, input_embeddings, name="predictions") self.predictions = TFDebertaV2LMPredictionHead(config, input_embeddings, name="predictions")
...@@ -1153,7 +1154,7 @@ class TFDebertaV2OnlyMLMHead(tf.keras.layers.Layer): ...@@ -1153,7 +1154,7 @@ class TFDebertaV2OnlyMLMHead(tf.keras.layers.Layer):
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaMainLayer with Deberta->DebertaV2 # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaMainLayer with Deberta->DebertaV2
class TFDebertaV2MainLayer(tf.keras.layers.Layer): class TFDebertaV2MainLayer(keras.layers.Layer):
config_class = DebertaV2Config config_class = DebertaV2Config
def __init__(self, config: DebertaV2Config, **kwargs): def __init__(self, config: DebertaV2Config, **kwargs):
...@@ -1164,7 +1165,7 @@ class TFDebertaV2MainLayer(tf.keras.layers.Layer): ...@@ -1164,7 +1165,7 @@ class TFDebertaV2MainLayer(tf.keras.layers.Layer):
self.embeddings = TFDebertaV2Embeddings(config, name="embeddings") self.embeddings = TFDebertaV2Embeddings(config, name="embeddings")
self.encoder = TFDebertaV2Encoder(config, name="encoder") self.encoder = TFDebertaV2Encoder(config, name="encoder")
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings return self.embeddings
def set_input_embeddings(self, value: tf.Variable): def set_input_embeddings(self, value: tf.Variable):
...@@ -1264,7 +1265,7 @@ DEBERTA_START_DOCSTRING = r""" ...@@ -1264,7 +1265,7 @@ DEBERTA_START_DOCSTRING = r"""
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data. improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
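Since the docstring now points at `keras.Model`, a hedged sanity check of that claim (the checkpoint choice and the `from_pt` conversion are assumptions, and require the PyTorch weights to be downloadable):

```python
from transformers import TFDebertaV2Model
from transformers.modeling_tf_utils import keras

# Assumed checkpoint; from_pt=True converts PyTorch weights on the fly.
model = TFDebertaV2Model.from_pretrained("microsoft/deberta-v3-small", from_pt=True)
assert isinstance(model, keras.Model)  # usable like any TF 2.0 Keras model
```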
...@@ -1412,7 +1413,7 @@ class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelin ...@@ -1412,7 +1413,7 @@ class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelin
self.deberta = TFDebertaV2MainLayer(config, name="deberta") self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.mlm = TFDebertaV2OnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls") self.mlm = TFDebertaV2OnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
@unpack_inputs @unpack_inputs
...@@ -1499,7 +1500,7 @@ class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenc ...@@ -1499,7 +1500,7 @@ class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenc
drop_out = getattr(config, "cls_dropout", None) drop_out = getattr(config, "cls_dropout", None)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = TFDebertaV2StableDropout(drop_out, name="cls_dropout") self.dropout = TFDebertaV2StableDropout(drop_out, name="cls_dropout")
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
...@@ -1594,8 +1595,8 @@ class TFDebertaV2ForTokenClassification(TFDebertaV2PreTrainedModel, TFTokenClass ...@@ -1594,8 +1595,8 @@ class TFDebertaV2ForTokenClassification(TFDebertaV2PreTrainedModel, TFTokenClass
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.deberta = TFDebertaV2MainLayer(config, name="deberta") self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
self.config = config self.config = config
...@@ -1678,7 +1679,7 @@ class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsw ...@@ -1678,7 +1679,7 @@ class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsw
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.deberta = TFDebertaV2MainLayer(config, name="deberta") self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.qa_outputs = tf.keras.layers.Dense( self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
) )
self.config = config self.config = config
...@@ -1777,9 +1778,9 @@ class TFDebertaV2ForMultipleChoice(TFDebertaV2PreTrainedModel, TFMultipleChoiceL ...@@ -1777,9 +1778,9 @@ class TFDebertaV2ForMultipleChoice(TFDebertaV2PreTrainedModel, TFMultipleChoiceL
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.deberta = TFDebertaV2MainLayer(config, name="deberta") self.deberta = TFDebertaV2MainLayer(config, name="deberta")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.pooler = TFDebertaV2ContextPooler(config, name="pooler") self.pooler = TFDebertaV2ContextPooler(config, name="pooler")
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
self.output_dim = self.pooler.output_dim self.output_dim = self.pooler.output_dim
......
...@@ -30,6 +30,7 @@ from ....modeling_tf_utils import ( ...@@ -30,6 +30,7 @@ from ....modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
get_initializer, get_initializer,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
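The `keras` name added to these imports comes from a shim in `modeling_tf_utils`. Roughly, and only as an approximation of the real import logic, it prefers the backwards-compatible `tf_keras` package and falls back to `keras` itself only while that is still Keras 2:

```python
# Approximate sketch of the import shim, not the verbatim implementation.
try:
    import tf_keras as keras
except (ModuleNotFoundError, ImportError):
    import keras

    if int(keras.__version__.split(".")[0]) > 2:
        raise ValueError(
            "Keras 3 is not supported by this code path; install the "
            "backwards-compatible tf-keras package instead."
        )
```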
...@@ -56,7 +57,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -56,7 +57,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
] ]
class TFPositionalEmbedding(tf.keras.layers.Layer): class TFPositionalEmbedding(keras.layers.Layer):
def __init__(self, demb, **kwargs): def __init__(self, demb, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -73,7 +74,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer): ...@@ -73,7 +74,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
return pos_emb[:, None, :] return pos_emb[:, None, :]
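The `pos_emb[:, None, :]` return above is the tail end of a sinusoidal embedding. A hedged reconstruction of the full computation (the `inv_freq` recipe is the standard Transformer-XL one and is assumed here):

```python
import tensorflow as tf

def positional_embedding(pos_seq, demb):
    # Standard sinusoidal recipe: sin/cos of pos / 10000^(2i/d).
    inv_freq = 1.0 / (10000 ** (tf.range(0, demb, 2.0) / demb))
    sinusoid = tf.einsum("i,j->ij", pos_seq, inv_freq)
    pos_emb = tf.concat([tf.sin(sinusoid), tf.cos(sinusoid)], axis=-1)
    return pos_emb[:, None, :]  # [len, 1, demb], matching the diff's return

# e.g. positional_embedding(tf.range(5.0), 16)
```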
class TFPositionwiseFF(tf.keras.layers.Layer): class TFPositionwiseFF(keras.layers.Layer):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -81,14 +82,14 @@ class TFPositionwiseFF(tf.keras.layers.Layer): ...@@ -81,14 +82,14 @@ class TFPositionwiseFF(tf.keras.layers.Layer):
self.d_inner = d_inner self.d_inner = d_inner
self.dropout = dropout self.dropout = dropout
self.layer_1 = tf.keras.layers.Dense( self.layer_1 = keras.layers.Dense(
d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0" d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0"
) )
self.drop_1 = tf.keras.layers.Dropout(dropout) self.drop_1 = keras.layers.Dropout(dropout)
self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3") self.layer_2 = keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
self.drop_2 = tf.keras.layers.Dropout(dropout) self.drop_2 = keras.layers.Dropout(dropout)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
self.pre_lnorm = pre_lnorm self.pre_lnorm = pre_lnorm
...@@ -116,7 +117,7 @@ class TFPositionwiseFF(tf.keras.layers.Layer): ...@@ -116,7 +117,7 @@ class TFPositionwiseFF(tf.keras.layers.Layer):
return output return output
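The `pre_lnorm` flag stored above selects between pre- and post-normalization. A simplified sketch of the two call paths (layer names as constructed above; the dropout `training` flags are condensed):

```python
def positionwise_ff(inp, ff, training=False):
    # Sketch only: `ff` stands for a TFPositionwiseFF-like object.
    if ff.pre_lnorm:
        core = ff.layer_2(ff.drop_1(ff.layer_1(ff.layer_norm(inp))))
        return ff.drop_2(core, training=training) + inp  # residual after norm-first
    core = ff.layer_2(ff.drop_1(ff.layer_1(inp)))
    return ff.layer_norm(ff.drop_2(core, training=training) + inp)  # post-norm
```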
class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): class TFRelPartialLearnableMultiHeadAttn(keras.layers.Layer):
def __init__( def __init__(
self, self,
n_head, n_head,
...@@ -140,17 +141,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): ...@@ -140,17 +141,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
self.dropout = dropout self.dropout = dropout
self.output_attentions = output_attentions self.output_attentions = output_attentions
self.qkv_net = tf.keras.layers.Dense( self.qkv_net = keras.layers.Dense(
3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net" 3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net"
) )
self.drop = tf.keras.layers.Dropout(dropout) self.drop = keras.layers.Dropout(dropout)
self.dropatt = tf.keras.layers.Dropout(dropatt) self.dropatt = keras.layers.Dropout(dropatt)
self.o_net = tf.keras.layers.Dense( self.o_net = keras.layers.Dense(
d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net" d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net"
) )
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
self.scale = 1 / (d_head**0.5) self.scale = 1 / (d_head**0.5)
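For context on `qkv_net` above: it packs queries, keys, and values into a single projection of width `3 * n_head * d_head`, split apart again before attention. A small sketch with assumed sizes:

```python
import tensorflow as tf

n_head, d_head = 8, 64  # assumed example sizes
w_heads = tf.zeros([10, 1, 3 * n_head * d_head])  # dummy qkv_net output [qlen, bsz, 3*n*d]
w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
# Attention logits are then scaled by self.scale = 1 / sqrt(d_head).
```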
...@@ -163,7 +164,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): ...@@ -163,7 +164,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
self.r_r_bias = None self.r_r_bias = None
self.r_w_bias = None self.r_w_bias = None
self.r_net = tf.keras.layers.Dense( self.r_net = keras.layers.Dense(
self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net" self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net"
) )
...@@ -268,7 +269,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): ...@@ -268,7 +269,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
return outputs return outputs
class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): class TFRelPartialLearnableDecoderLayer(keras.layers.Layer):
def __init__( def __init__(
self, self,
n_head, n_head,
...@@ -320,7 +321,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): ...@@ -320,7 +321,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
return outputs return outputs
class TFTransfoEmbeddings(tf.keras.layers.Layer): class TFTransfoEmbeddings(keras.layers.Layer):
def __init__(self, vocab_size, emb_size, init_std, **kwargs): def __init__(self, vocab_size, emb_size, init_std, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -341,7 +342,7 @@ class TFTransfoEmbeddings(tf.keras.layers.Layer): ...@@ -341,7 +342,7 @@ class TFTransfoEmbeddings(tf.keras.layers.Layer):
return tf.gather(self.weight, inputs) return tf.gather(self.weight, inputs)
class TFAdaptiveEmbedding(tf.keras.layers.Layer): class TFAdaptiveEmbedding(keras.layers.Layer):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -418,7 +419,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): ...@@ -418,7 +419,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFTransfoXLMainLayer(tf.keras.layers.Layer): class TFTransfoXLMainLayer(keras.layers.Layer):
config_class = TransfoXLConfig config_class = TransfoXLConfig
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
...@@ -447,7 +448,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -447,7 +448,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
name="word_emb", name="word_emb",
) )
self.drop = tf.keras.layers.Dropout(config.dropout) self.drop = keras.layers.Dropout(config.dropout)
self.n_layer = config.n_layer self.n_layer = config.n_layer
self.mem_len = config.mem_len self.mem_len = config.mem_len
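`mem_len` above sizes the segment-level memory that gives Transformer-XL its recurrence: cached hidden states from previous segments are prepended to the current segment's keys and values. A toy illustration with assumed shapes:

```python
import tensorflow as tf

mems = tf.zeros([4, 1, 8])            # cached states, [mem_len, bsz, d_model] (assumed sizes)
h = tf.ones([3, 1, 8])                # current segment, [qlen, bsz, d_model]
attn_input = tf.concat([mems, h], 0)  # attention now spans mem_len + qlen positions
```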
...@@ -773,7 +774,7 @@ TRANSFO_XL_START_DOCSTRING = r""" ...@@ -773,7 +774,7 @@ TRANSFO_XL_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -1022,7 +1023,7 @@ class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenc ...@@ -1022,7 +1023,7 @@ class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenc
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.score = tf.keras.layers.Dense( self.score = keras.layers.Dense(
config.num_labels, config.num_labels,
kernel_initializer=get_initializer(config.init_range), kernel_initializer=get_initializer(config.init_range),
name="score", name="score",
......
...@@ -20,10 +20,11 @@ ...@@ -20,10 +20,11 @@
import tensorflow as tf import tensorflow as tf
from ....modeling_tf_utils import keras
from ....tf_utils import shape_list from ....tf_utils import shape_list
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): class TFAdaptiveSoftmaxMask(keras.layers.Layer):
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
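The `cutoffs` argument above partitions the vocabulary into frequency clusters for the adaptive softmax. A runnable illustration (the cutoff and vocabulary values are assumptions in the spirit of WikiText-103 setups, not taken from the diff):

```python
cutoffs, vocab_size = [20000, 40000, 200000], 267735  # assumed example values
bounds = [0] + cutoffs + [vocab_size]
for i, (lo, hi) in enumerate(zip(bounds, bounds[1:])):
    print(f"cluster {i}: token ids [{lo}, {hi})")  # head cluster, then rarer tails
```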
......
...@@ -32,6 +32,7 @@ from ...modeling_tf_utils import ( ...@@ -32,6 +32,7 @@ from ...modeling_tf_utils import (
TFModelInputType, TFModelInputType,
TFPreTrainedModel, TFPreTrainedModel,
get_initializer, get_initializer,
keras,
unpack_inputs, unpack_inputs,
) )
from ...tf_utils import shape_list from ...tf_utils import shape_list
...@@ -77,7 +78,7 @@ ENCODER_DECODER_START_DOCSTRING = r""" ...@@ -77,7 +78,7 @@ ENCODER_DECODER_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -258,7 +259,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss): ...@@ -258,7 +259,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
self.encoder.config.hidden_size != self.decoder.config.hidden_size self.encoder.config.hidden_size != self.decoder.config.hidden_size
and self.decoder.config.cross_attention_hidden_size is None and self.decoder.config.cross_attention_hidden_size is None
): ):
self.enc_to_dec_proj = tf.keras.layers.Dense( self.enc_to_dec_proj = keras.layers.Dense(
units=self.decoder.config.hidden_size, units=self.decoder.config.hidden_size,
kernel_initializer=get_initializer(config.encoder.initializer_range), kernel_initializer=get_initializer(config.encoder.initializer_range),
name="enc_to_dec_proj", name="enc_to_dec_proj",
...@@ -445,7 +446,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss): ...@@ -445,7 +446,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix
decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
# Make sure these 2 `tf.keras.Model` have fixed names so `from_pretrained` could load model weights correctly. # Make sure these 2 `keras.Model` have fixed names so `from_pretrained` could load model weights correctly.
if encoder.name != "encoder": if encoder.name != "encoder":
raise ValueError("encoder model must be created with the name `encoder`.") raise ValueError("encoder model must be created with the name `encoder`.")
if decoder.name != "decoder": if decoder.name != "decoder":
......