Unverified commit 415e9a09, authored by Matt and committed by GitHub

Add tf_keras imports to prepare for Keras 3 (#28588)

* Port core files + ESM (because ESM code is odd)

* Search-replace in modelling code

* Fix up transfo_xl as well

* Fix other core files + tests (still need to add correct import to tests)

* Fix cookiecutter

* make fixup, fix imports in some more core files

* Auto-add imports to tests

* Cleanup, add imports to sagemaker tests

* Use correct exception for importing tf_keras

* Fixes in modeling_tf_utils

* make fixup

* Correct version parsing code

* Ensure the pipeline tests correctly revert to float32 after each test

* Ensure the pipeline tests correctly revert to float32 after each test

* More tf.keras -> keras

* Add dtype cast

* Better imports of tf_keras

* Add a cast for tf.assign, just in case

* Fix callback imports
parent 1d489b3e
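
Every `keras,` entry added to the `modeling_tf_utils` import lists in the hunks below resolves to a single compatibility shim defined in that module, so the modeling files themselves never touch `tf.keras` or `tf_keras` directly. A minimal sketch of that shim, assuming the `packaging` library for the version check (the exact wording of the error message in the real module may differ):

```python
# Sketch of the object that `from ...modeling_tf_utils import keras` refers to.
from packaging.version import parse

try:
    # Keras 2 lives on as the standalone `tf_keras` package.
    import tf_keras as keras
except (ModuleNotFoundError, ImportError):
    import keras

    # A bare Keras 3 install cannot run the TF modeling code.
    if parse(keras.__version__).major > 2:
        raise ValueError(
            "Keras 3 is not supported by the TF modeling code; "
            "install the backwards-compatible `tf-keras` package instead."
        )
```

With that shim in place, the bulk of this commit is the mechanical search-and-replace of `tf.keras.*` with `keras.*` visible in the diffs that follow.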
@@ -46,6 +46,7 @@ from ...modeling_tf_utils import (
     TFSequenceClassificationLoss,
     TFTokenClassificationLoss,
     get_initializer,
+    keras,
     keras_serializable,
     unpack_inputs,
 )
@@ -80,7 +81,7 @@ XLM_ROBERTA_START_DOCSTRING = r"""
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)

-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
     as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
     behavior.
@@ -162,7 +163,7 @@ XLM_ROBERTA_INPUTS_DOCSTRING = r"""

 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings with Roberta->XLMRoberta
-class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
+class TFXLMRobertaEmbeddings(keras.layers.Layer):
     """
     Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
     """
@@ -175,8 +176,8 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
         self.hidden_size = config.hidden_size
         self.max_position_embeddings = config.max_position_embeddings
         self.initializer_range = config.initializer_range
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def build(self, input_shape=None):
         with tf.name_scope("word_embeddings"):
@@ -268,11 +269,11 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):

 # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->XLMRoberta
-class TFXLMRobertaPooler(tf.keras.layers.Layer):
+class TFXLMRobertaPooler(keras.layers.Layer):
     def __init__(self, config: XLMRobertaConfig, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             units=config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),
             activation="tanh",
@@ -298,7 +299,7 @@ class TFXLMRobertaPooler(tf.keras.layers.Layer):

 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->XLMRoberta
-class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):
+class TFXLMRobertaSelfAttention(keras.layers.Layer):
     def __init__(self, config: XLMRobertaConfig, **kwargs):
         super().__init__(**kwargs)
@@ -313,16 +314,16 @@ class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

-        self.query = tf.keras.layers.Dense(
+        self.query = keras.layers.Dense(
             units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
         )
-        self.key = tf.keras.layers.Dense(
+        self.key = keras.layers.Dense(
             units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
         )
-        self.value = tf.keras.layers.Dense(
+        self.value = keras.layers.Dense(
             units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
         )
-        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

         self.is_decoder = config.is_decoder
         self.config = config
@@ -431,15 +432,15 @@ class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):

 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->XLMRoberta
-class TFXLMRobertaSelfOutput(tf.keras.layers.Layer):
+class TFXLMRobertaSelfOutput(keras.layers.Layer):
     def __init__(self, config: XLMRobertaConfig, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
         self.config = config

     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
@@ -462,7 +463,7 @@ class TFXLMRobertaSelfOutput(tf.keras.layers.Layer):

 # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->XLMRoberta
-class TFXLMRobertaAttention(tf.keras.layers.Layer):
+class TFXLMRobertaAttention(keras.layers.Layer):
     def __init__(self, config: XLMRobertaConfig, **kwargs):
         super().__init__(**kwargs)
@@ -514,11 +515,11 @@ class TFXLMRobertaAttention(tf.keras.layers.Layer):

 # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->XLMRoberta
-class TFXLMRobertaIntermediate(tf.keras.layers.Layer):
+class TFXLMRobertaIntermediate(keras.layers.Layer):
     def __init__(self, config: XLMRobertaConfig, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -544,15 +545,15 @@ class TFXLMRobertaIntermediate(tf.keras.layers.Layer):

 # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->XLMRoberta
-class TFXLMRobertaOutput(tf.keras.layers.Layer):
+class TFXLMRobertaOutput(keras.layers.Layer):
     def __init__(self, config: XLMRobertaConfig, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
         self.config = config

     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
@@ -575,7 +576,7 @@ class TFXLMRobertaOutput(tf.keras.layers.Layer):

 # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->XLMRoberta
-class TFXLMRobertaLayer(tf.keras.layers.Layer):
+class TFXLMRobertaLayer(keras.layers.Layer):
     def __init__(self, config: XLMRobertaConfig, **kwargs):
         super().__init__(**kwargs)
@@ -679,7 +680,7 @@ class TFXLMRobertaLayer(tf.keras.layers.Layer):

 # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->XLMRoberta
-class TFXLMRobertaEncoder(tf.keras.layers.Layer):
+class TFXLMRobertaEncoder(keras.layers.Layer):
     def __init__(self, config: XLMRobertaConfig, **kwargs):
         super().__init__(**kwargs)
         self.config = config
@@ -759,7 +760,7 @@ class TFXLMRobertaEncoder(tf.keras.layers.Layer):

 @keras_serializable
 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->XLMRoberta
-class TFXLMRobertaMainLayer(tf.keras.layers.Layer):
+class TFXLMRobertaMainLayer(keras.layers.Layer):
     config_class = XLMRobertaConfig

     def __init__(self, config, add_pooling_layer=True, **kwargs):
@@ -779,7 +780,7 @@ class TFXLMRobertaMainLayer(tf.keras.layers.Layer):
         self.embeddings = TFXLMRobertaEmbeddings(config, name="embeddings")

     # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
-    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+    def get_input_embeddings(self) -> keras.layers.Layer:
         return self.embeddings

     # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
@@ -1063,7 +1064,7 @@ class TFXLMRobertaModel(TFXLMRobertaPreTrainedModel):

 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->XLMRoberta
-class TFXLMRobertaLMHead(tf.keras.layers.Layer):
+class TFXLMRobertaLMHead(keras.layers.Layer):
     """XLMRoberta Head for masked language modeling."""

     def __init__(self, config, input_embeddings, **kwargs):
@@ -1071,10 +1072,10 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer):
         self.config = config
         self.hidden_size = config.hidden_size
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
         self.act = get_tf_activation("gelu")

         # The output weights are the same as the input embeddings, but there is
@@ -1352,12 +1353,12 @@ class TFXLMRobertaForCausalLM(TFXLMRobertaPreTrainedModel, TFCausalLanguageModelingLoss):

 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->XLMRoberta
-class TFXLMRobertaClassificationHead(tf.keras.layers.Layer):
+class TFXLMRobertaClassificationHead(keras.layers.Layer):
     """Head for sentence-level classification tasks."""

     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
             config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),
             activation="tanh",
@@ -1366,8 +1367,8 @@ class TFXLMRobertaClassificationHead(tf.keras.layers.Layer):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
-        self.out_proj = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.out_proj = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
         self.config = config
@@ -1497,8 +1498,8 @@ class TFXLMRobertaForMultipleChoice(TFXLMRobertaPreTrainedModel, TFMultipleChoiceLoss):
         super().__init__(config, *inputs, **kwargs)

         self.roberta = TFXLMRobertaMainLayer(config, name="roberta")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
         self.config = config
@@ -1606,8 +1607,8 @@ class TFXLMRobertaForTokenClassification(TFXLMRobertaPreTrainedModel, TFTokenClassificationLoss):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
-        self.classifier = tf.keras.layers.Dense(
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.classifier = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
         self.config = config
@@ -1698,7 +1699,7 @@ class TFXLMRobertaForQuestionAnswering(TFXLMRobertaPreTrainedModel, TFQuestionAnsweringLoss):
         self.num_labels = config.num_labels

         self.roberta = TFXLMRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
-        self.qa_outputs = tf.keras.layers.Dense(
+        self.qa_outputs = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
         self.config = config
...
@@ -39,6 +39,7 @@ from ...modeling_tf_utils import (
     TFSharedEmbeddings,
     TFTokenClassificationLoss,
     get_initializer,
+    keras,
     keras_serializable,
     unpack_inputs,
 )
@@ -66,7 +67,7 @@ TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]

-class TFXLNetRelativeAttention(tf.keras.layers.Layer):
+class TFXLNetRelativeAttention(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -83,8 +84,8 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         self.initializer_range = config.initializer_range
         self.output_attentions = config.output_attentions

-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.dropout = keras.layers.Dropout(config.dropout)
         self.config = config

     def build(self, input_shape=None):
@@ -336,17 +337,17 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         return outputs

-class TFXLNetFeedForward(tf.keras.layers.Layer):
+class TFXLNetFeedForward(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
-        self.layer_1 = tf.keras.layers.Dense(
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.layer_1 = keras.layers.Dense(
             config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
         )
-        self.layer_2 = tf.keras.layers.Dense(
+        self.layer_2 = keras.layers.Dense(
             config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2"
         )
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.dropout = keras.layers.Dropout(config.dropout)
         if isinstance(config.ff_activation, str):
             self.activation_function = get_tf_activation(config.ff_activation)
         else:
@@ -378,12 +379,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
                 self.layer_2.build([None, None, self.config.d_inner])

-class TFXLNetLayer(tf.keras.layers.Layer):
+class TFXLNetLayer(keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
         self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
         self.ff = TFXLNetFeedForward(config, name="ff")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.dropout = keras.layers.Dropout(config.dropout)

     def call(
         self,
@@ -433,7 +434,7 @@ class TFXLNetLayer(tf.keras.layers.Layer):
             self.ff.build(None)

-class TFXLNetLMHead(tf.keras.layers.Layer):
+class TFXLNetLMHead(keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
         super().__init__(**kwargs)
         self.config = config
@@ -466,7 +467,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):

 @keras_serializable
-class TFXLNetMainLayer(tf.keras.layers.Layer):
+class TFXLNetMainLayer(keras.layers.Layer):
     config_class = XLNetConfig

     def __init__(self, config, **kwargs):
@@ -492,7 +493,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding"
         )
         self.layer = [TFXLNetLayer(config, name=f"layer_._{i}") for i in range(config.n_layer)]
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.dropout = keras.layers.Dropout(config.dropout)

         self.use_mems_eval = config.use_mems_eval
         self.use_mems_train = config.use_mems_train
@@ -1059,7 +1060,7 @@ XLNET_START_DOCSTRING = r"""
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)

-    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
     as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
     behavior.
@@ -1415,7 +1416,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassificationLoss):
         self.sequence_summary = TFSequenceSummary(
             config, initializer_range=config.initializer_range, name="sequence_summary"
         )
-        self.logits_proj = tf.keras.layers.Dense(
+        self.logits_proj = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
         )
         self.config = config
@@ -1516,7 +1517,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
         self.sequence_summary = TFSequenceSummary(
             config, initializer_range=config.initializer_range, name="sequence_summary"
         )
-        self.logits_proj = tf.keras.layers.Dense(
+        self.logits_proj = keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
         )
         self.config = config
@@ -1630,7 +1631,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificationLoss):
         self.num_labels = config.num_labels

         self.transformer = TFXLNetMainLayer(config, name="transformer")
-        self.classifier = tf.keras.layers.Dense(
+        self.classifier = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
         self.config = config
@@ -1720,7 +1721,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         self.transformer = TFXLNetMainLayer(config, name="transformer")
-        self.qa_outputs = tf.keras.layers.Dense(
+        self.qa_outputs = keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
         self.config = config
...
@@ -22,12 +22,14 @@ import tensorflow as tf

 try:
+    from tf_keras.optimizers.legacy import Adam
+except (ImportError, ModuleNotFoundError):
     from tensorflow.keras.optimizers.legacy import Adam
-except ImportError:
-    from tensorflow.keras.optimizers import Adam

+from .modeling_tf_utils import keras

-class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
+class WarmUp(keras.optimizers.schedules.LearningRateSchedule):
     """
     Applies a warmup schedule on a given learning rate decay schedule.
@@ -131,7 +133,7 @@ def create_optimizer(
         applied to all parameters except bias and layer norm parameters.
     """
     # Implements linear decay of the learning rate.
-    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
+    lr_schedule = keras.optimizers.schedules.PolynomialDecay(
         initial_learning_rate=init_lr,
         decay_steps=num_train_steps - num_warmup_steps,
         end_learning_rate=init_lr * min_lr_ratio,
@@ -156,7 +158,7 @@ def create_optimizer(
             include_in_weight_decay=include_in_weight_decay,
         )
     else:
-        optimizer = tf.keras.optimizers.Adam(
+        optimizer = keras.optimizers.Adam(
             learning_rate=lr_schedule,
             beta_1=adam_beta1,
             beta_2=adam_beta2,
@@ -180,7 +182,7 @@ class AdamWeightDecay(Adam):
     to adding the square of the weights to the loss with plain (non-momentum) SGD.

     Args:
-        learning_rate (`Union[float, tf.keras.optimizers.schedules.LearningRateSchedule]`, *optional*, defaults to 0.001):
+        learning_rate (`Union[float, keras.optimizers.schedules.LearningRateSchedule]`, *optional*, defaults to 0.001):
             The learning rate to use or a schedule.
         beta_1 (`float`, *optional*, defaults to 0.9):
             The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
@@ -210,7 +212,7 @@ class AdamWeightDecay(Adam):
     def __init__(
         self,
-        learning_rate: Union[float, tf.keras.optimizers.schedules.LearningRateSchedule] = 0.001,
+        learning_rate: Union[float, keras.optimizers.schedules.LearningRateSchedule] = 0.001,
         beta_1: float = 0.9,
         beta_2: float = 0.999,
         epsilon: float = 1e-7,
...
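
The `create_optimizer` hunks above only swap the `tf.keras` namespace for the shimmed `keras`; the public API is unchanged. A quick usage sketch (the hyperparameter values here are arbitrary):

```python
from transformers import create_optimizer

# Returns the optimizer plus the WarmUp-wrapped PolynomialDecay schedule:
# linear warmup for 100 steps, then linear decay from 5e-5 toward
# init_lr * min_lr_ratio over the remaining 900 steps.
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=1000,
    num_warmup_steps=100,
    weight_decay_rate=0.01,  # nonzero rate selects the AdamWeightDecay branch
)
```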
@@ -25,6 +25,8 @@ logger = logging.get_logger(__name__)

 if is_tf_available():
     import tensorflow as tf

+    from .modeling_tf_utils import keras
+

 @dataclass
 class TFTrainingArguments(TrainingArguments):
@@ -195,7 +197,7 @@ class TFTrainingArguments(TrainingArguments):
             # Set to float16 at first
             if self.fp16:
-                tf.keras.mixed_precision.set_global_policy("mixed_float16")
+                keras.mixed_precision.set_global_policy("mixed_float16")

             if self.no_cuda:
                 strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
@@ -216,7 +218,7 @@ class TFTrainingArguments(TrainingArguments):
             if tpu:
                 # Set to bfloat16 in case of TPU
                 if self.fp16:
-                    tf.keras.mixed_precision.set_global_policy("mixed_bfloat16")
+                    keras.mixed_precision.set_global_policy("mixed_bfloat16")

                 tf.config.experimental_connect_to_cluster(tpu)
                 tf.tpu.experimental.initialize_tpu_system(tpu)
...
...@@ -50,6 +50,7 @@ from ...modeling_tf_utils import ( ...@@ -50,6 +50,7 @@ from ...modeling_tf_utils import (
TFSequenceSummary, TFSequenceSummary,
TFTokenClassificationLoss, TFTokenClassificationLoss,
get_initializer, get_initializer,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -70,7 +71,7 @@ TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -70,7 +71,7 @@ TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Embeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.""" """Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
...@@ -81,8 +82,8 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): ...@@ -81,8 +82,8 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer):
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape): def build(self, input_shape: tf.TensorShape):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
...@@ -149,7 +150,7 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): ...@@ -149,7 +150,7 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}SelfAttention(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -164,16 +165,16 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) ...@@ -164,16 +165,16 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size) self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = tf.keras.layers.Dense( self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
) )
self.key = tf.keras.layers.Dense( self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
) )
self.value = tf.keras.layers.Dense( self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
) )
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder self.is_decoder = config.is_decoder
...@@ -267,15 +268,15 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) ...@@ -267,15 +268,15 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}SelfOutput(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
...@@ -286,7 +287,7 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): ...@@ -286,7 +287,7 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Attention(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -327,11 +328,11 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): ...@@ -327,11 +328,11 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Intermediate(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -348,15 +349,15 @@ class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): ...@@ -348,15 +349,15 @@ class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Output(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
...@@ -367,7 +368,7 @@ class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): ...@@ -367,7 +368,7 @@ class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Layer(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -454,7 +455,7 @@ class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer): ...@@ -454,7 +455,7 @@ class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Encoder(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -524,11 +525,11 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): ...@@ -524,11 +525,11 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
...@@ -539,7 +540,7 @@ class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.lay ...@@ -539,7 +540,7 @@ class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.lay
else: else:
self.transform_act_fn = config.hidden_act self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
...@@ -550,8 +551,8 @@ class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.lay ...@@ -550,8 +551,8 @@ class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.lay
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
...@@ -568,7 +569,7 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay ...@@ -568,7 +569,7 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
super().build(input_shape) super().build(input_shape)
def get_output_embeddings(self) -> tf.keras.layers.Layer: def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable): def set_output_embeddings(self, value: tf.Variable):
...@@ -594,8 +595,8 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay ...@@ -594,8 +595,8 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}MLMHead(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.predictions = TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(config, input_embeddings, name="predictions") self.predictions = TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(config, input_embeddings, name="predictions")
...@@ -607,7 +608,7 @@ class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer): ...@@ -607,7 +608,7 @@ class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}MainLayer(keras.layers.Layer):
config_class = {{cookiecutter.camelcase_modelname}}Config config_class = {{cookiecutter.camelcase_modelname}}Config
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, add_pooling_layer: bool = True, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, add_pooling_layer: bool = True, **kwargs):
...@@ -620,7 +621,7 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): ...@@ -620,7 +621,7 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer):
self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, name="encoder") self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, name="encoder")
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings return self.embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
...@@ -811,7 +812,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): ...@@ -811,7 +812,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
generic methods the library implements for all its model (such as downloading or saving, resizing the input generic methods the library implements for all its model (such as downloading or saving, resizing the input
embeddings, pruning heads etc.) embeddings, pruning heads etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass.
Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general
usage and behavior. usage and behavior.
...@@ -991,7 +992,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca ...@@ -991,7 +992,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
@unpack_inputs @unpack_inputs
...@@ -1064,7 +1065,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca ...@@ -1064,7 +1065,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, input_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
def prepare_inputs_for_generation(self, inputs, past_key_values=None, attention_mask=None, **model_kwargs): def prepare_inputs_for_generation(self, inputs, past_key_values=None, attention_mask=None, **model_kwargs):
...@@ -1166,17 +1167,17 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca ...@@ -1166,17 +1167,17 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca
class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks.""" """Head for sentence-level classification tasks."""
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.out_proj = tf.keras.layers.Dense( self.out_proj = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
) )
...@@ -1277,7 +1278,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c ...@@ -1277,7 +1278,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c
self.sequence_summary = TFSequenceSummary( self.sequence_summary = TFSequenceSummary(
config, config.initializer_range, name="sequence_summary" config, config.initializer_range, name="sequence_summary"
) )
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
...@@ -1383,8 +1384,8 @@ class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecut ...@@ -1383,8 +1384,8 @@ class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecut
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
...@@ -1456,7 +1457,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutte ...@@ -1456,7 +1457,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutte
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.qa_outputs = tf.keras.layers.Dense( self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
) )
...@@ -1623,7 +1624,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): ...@@ -1623,7 +1624,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
return (one_cst - expanded_mask) * LARGE_NEGATIVE return (one_cst - expanded_mask) * LARGE_NEGATIVE
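For readers skimming the hunk: `_expand_mask` turns a 1/0 attention mask into an additive bias. A minimal sketch of the idiom, assuming `LARGE_NEGATIVE = -1e8` (the constant is defined elsewhere in the template):

```python
# Positions with mask value 1 contribute ~0.0; masked positions get a large negative
# bias, so they vanish after the attention softmax. -1e8 is an assumed value.
import tensorflow as tf

LARGE_NEGATIVE = -1e8
expanded_mask = tf.constant([[1.0, 1.0, 0.0]])
one_cst = tf.constant(1.0)
print((one_cst - expanded_mask) * LARGE_NEGATIVE)  # approx. [[0., 0., -1e8]]
```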
class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(tf.keras.layers.Embedding): class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(keras.layers.Embedding):
""" """
This module learns positional embeddings up to a fixed maximum size. This module learns positional embeddings up to a fixed maximum size.
""" """
...@@ -1639,7 +1640,7 @@ class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(tf.keras. ...@@ -1639,7 +1640,7 @@ class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(tf.keras.
return super().call(tf.cast(position_ids, dtype=tf.int32)) return super().call(tf.cast(position_ids, dtype=tf.int32))
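A minimal usage sketch of the subclass above with toy sizes; note the real template also applies a past-key-values offset to `position_ids`, which this hunk elides:

```python
# Toy-sized sketch; the offset handling in the real template is elided here.
import tensorflow as tf
from transformers.modeling_tf_utils import keras

class LearnedPositionalEmbedding(keras.layers.Embedding):
    def call(self, position_ids):
        # Embedding lookups need integer indices, hence the defensive int32 cast.
        return super().call(tf.cast(position_ids, dtype=tf.int32))

emb = LearnedPositionalEmbedding(input_dim=64, output_dim=16)  # max positions, d_model
positions = tf.range(10)[tf.newaxis, :]                        # shape (1, seq_len)
print(emb(positions).shape)                                    # (1, 10, 16)
```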
class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Attention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need""" """Multi-headed attention from "Attention Is All You Need"""
def __init__( def __init__(
...@@ -1655,16 +1656,16 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): ...@@ -1655,16 +1656,16 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
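`_shape` is the standard head-splitting helper; a quick shape check makes the layout explicit:

```python
# (bsz, seq_len, embed_dim) -> (bsz, num_heads, seq_len, head_dim), the layout the
# batched matmuls in the attention body expect.
import tensorflow as tf

bsz, seq_len, num_heads, head_dim = 2, 5, 4, 8
tensor = tf.zeros((bsz, seq_len, num_heads * head_dim))
shaped = tf.transpose(tf.reshape(tensor, (bsz, seq_len, num_heads, head_dim)), (0, 2, 1, 3))
assert shaped.shape == (bsz, num_heads, seq_len, head_dim)
```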
...@@ -1776,20 +1777,20 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): ...@@ -1776,20 +1777,20 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value return attn_output, attn_weights, past_key_value
class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
self.self_attn = TF{{cookiecutter.camelcase_modelname}}Attention( self.self_attn = TF{{cookiecutter.camelcase_modelname}}Attention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
) )
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False): def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
""" """
...@@ -1826,7 +1827,7 @@ class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(tf.keras.layers.Layer): ...@@ -1826,7 +1827,7 @@ class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights return hidden_states, self_attn_weights
class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(keras.layers.Layer):
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
...@@ -1837,11 +1838,11 @@ class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer): ...@@ -1837,11 +1838,11 @@ class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer):
name="self_attn", name="self_attn",
is_decoder=True, is_decoder=True,
) )
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TF{{cookiecutter.camelcase_modelname}}Attention( self.encoder_attn = TF{{cookiecutter.camelcase_modelname}}Attention(
self.embed_dim, self.embed_dim,
config.decoder_attention_heads, config.decoder_attention_heads,
...@@ -1849,10 +1850,10 @@ class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer): ...@@ -1849,10 +1850,10 @@ class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer):
name="encoder_attn", name="encoder_attn",
is_decoder=True, is_decoder=True,
) )
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
def call( def call(
self, self,
...@@ -1944,7 +1945,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): ...@@ -1944,7 +1945,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
generic methods the library implements for all its models (such as downloading or saving, resizing the input generic methods the library implements for all its models (such as downloading or saving, resizing the input
embeddings, pruning heads etc.) embeddings, pruning heads etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
and behavior. and behavior.
...@@ -2062,7 +2063,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): ...@@ -2062,7 +2063,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
@keras_serializable @keras_serializable
class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Encoder(keras.layers.Layer):
config_class = {{cookiecutter.camelcase_modelname}}Config config_class = {{cookiecutter.camelcase_modelname}}Config
""" """
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
...@@ -2072,10 +2073,10 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): ...@@ -2072,10 +2073,10 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
config: {{cookiecutter.camelcase_modelname}}Config config: {{cookiecutter.camelcase_modelname}}Config
""" """
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings self.max_source_positions = config.max_position_embeddings
...@@ -2088,7 +2089,7 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): ...@@ -2088,7 +2089,7 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
name="embed_positions", name="embed_positions",
) )
self.layers = [TF{{cookiecutter.camelcase_modelname}}EncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] self.layers = [TF{{cookiecutter.camelcase_modelname}}EncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
def get_embed_tokens(self): def get_embed_tokens(self):
return self.embed_tokens return self.embed_tokens
...@@ -2215,7 +2216,7 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): ...@@ -2215,7 +2216,7 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}Decoder(keras.layers.Layer):
config_class = {{cookiecutter.camelcase_modelname}}Config config_class = {{cookiecutter.camelcase_modelname}}Config
""" """
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TF{{cookiecutter.camelcase_modelname}}DecoderLayer`] Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TF{{cookiecutter.camelcase_modelname}}DecoderLayer`]
...@@ -2225,7 +2226,7 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): ...@@ -2225,7 +2226,7 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer):
embed_tokens: output embedding embed_tokens: output embedding
""" """
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
...@@ -2238,9 +2239,9 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): ...@@ -2238,9 +2239,9 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer):
) )
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TF{{cookiecutter.camelcase_modelname}}DecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] self.layers = [TF{{cookiecutter.camelcase_modelname}}DecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self): def get_embed_tokens(self):
return self.embed_tokens return self.embed_tokens
...@@ -2458,17 +2459,17 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): ...@@ -2458,17 +2459,17 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}MainLayer(keras.layers.Layer):
config_class = {{cookiecutter.camelcase_modelname}}Config config_class = {{cookiecutter.camelcase_modelname}}Config
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.shared = tf.keras.layers.Embedding( self.shared = keras.layers.Embedding(
input_dim=config.vocab_size, input_dim=config.vocab_size,
output_dim=config.d_model, output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared" name="model.shared"
) )
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
...@@ -2637,9 +2638,9 @@ class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_mod ...@@ -2637,9 +2638,9 @@ class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_mod
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer): class BiasLayer(keras.layers.Layer):
""" """
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer. so all weights have to be registered in a layer.
""" """
...@@ -2811,9 +2812,9 @@ class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiec ...@@ -2811,9 +2812,9 @@ class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiec
def hf_compute_loss(self, labels, logits): def hf_compute_loss(self, labels, logits):
"""CrossEntropyLoss that ignores pad tokens""" """CrossEntropyLoss that ignores pad tokens"""
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( loss_fn = keras.losses.SparseCategoricalCrossentropy(
from_logits=True, from_logits=True,
reduction=tf.keras.losses.Reduction.NONE, reduction=keras.losses.Reduction.NONE,
) )
melted_labels = tf.reshape(labels, (-1,)) melted_labels = tf.reshape(labels, (-1,))
active_loss = tf.not_equal(melted_labels, self.config.pad_token_id) active_loss = tf.not_equal(melted_labels, self.config.pad_token_id)
......
...@@ -43,6 +43,7 @@ if is_tf_available(): ...@@ -43,6 +43,7 @@ if is_tf_available():
TFMinLengthLogitsProcessor, TFMinLengthLogitsProcessor,
tf_top_k_top_p_filtering, tf_top_k_top_p_filtering,
) )
from transformers.modeling_tf_utils import keras
if is_tensorflow_text_available(): if is_tensorflow_text_available():
import tensorflow_text as text import tensorflow_text as text
...@@ -254,7 +255,7 @@ class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTests ...@@ -254,7 +255,7 @@ class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTests
# file needed to load the TF tokenizer # file needed to load the TF tokenizer
hf_hub_download(repo_id="google/flan-t5-small", filename="spiece.model", local_dir=tmp_dir) hf_hub_download(repo_id="google/flan-t5-small", filename="spiece.model", local_dir=tmp_dir)
class CompleteSentenceTransformer(tf.keras.layers.Layer): class CompleteSentenceTransformer(keras.layers.Layer):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.tokenizer = text.SentencepieceTokenizer( self.tokenizer = text.SentencepieceTokenizer(
...@@ -271,9 +272,9 @@ class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTests ...@@ -271,9 +272,9 @@ class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTests
return self.tokenizer.detokenize(outputs) return self.tokenizer.detokenize(outputs)
complete_model = CompleteSentenceTransformer() complete_model = CompleteSentenceTransformer()
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name="inputs") inputs = keras.layers.Input(shape=(1,), dtype=tf.string, name="inputs")
outputs = complete_model(inputs) outputs = complete_model(inputs)
keras_model = tf.keras.Model(inputs, outputs) keras_model = keras.Model(inputs, outputs)
keras_model.save(tmp_dir) keras_model.save(tmp_dir)
def test_eos_token_id_int_and_list_top_k_top_sampling(self): def test_eos_token_id_int_and_list_top_k_top_sampling(self):
......
...@@ -10,6 +10,8 @@ from transformers.testing_utils import require_tensorflow_text, require_tf, slow ...@@ -10,6 +10,8 @@ from transformers.testing_utils import require_tensorflow_text, require_tf, slow
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers.modeling_tf_utils import keras
if is_tensorflow_text_available(): if is_tensorflow_text_available():
from transformers.models.bert import TFBertTokenizer from transformers.models.bert import TFBertTokenizer
...@@ -18,8 +20,9 @@ TOKENIZER_CHECKPOINTS = ["bert-base-uncased", "bert-base-cased"] ...@@ -18,8 +20,9 @@ TOKENIZER_CHECKPOINTS = ["bert-base-uncased", "bert-base-cased"]
TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only" TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only"
if is_tf_available(): if is_tf_available():
from transformers.modeling_tf_utils import keras
class ModelToSave(tf.keras.Model): class ModelToSave(keras.Model):
def __init__(self, tokenizer): def __init__(self, tokenizer):
super().__init__() super().__init__()
self.tokenizer = tokenizer self.tokenizer = tokenizer
......
...@@ -44,6 +44,7 @@ if is_tf_available(): ...@@ -44,6 +44,7 @@ if is_tf_available():
TFBlipTextModel, TFBlipTextModel,
TFBlipVisionModel, TFBlipVisionModel,
) )
from transformers.modeling_tf_utils import keras
from transformers.models.blip.modeling_tf_blip import TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.blip.modeling_tf_blip import TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST
...@@ -172,9 +173,9 @@ class TFBlipVisionModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -172,9 +173,9 @@ class TFBlipVisionModelTest(TFModelTesterMixin, unittest.TestCase):
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
x = model.get_output_embeddings() x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
def test_model(self): def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
......
...@@ -38,6 +38,7 @@ if is_tf_available(): ...@@ -38,6 +38,7 @@ if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings
from transformers.modeling_tf_utils import keras
from transformers.models.clip.modeling_tf_clip import TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.clip.modeling_tf_clip import TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST
...@@ -151,9 +152,9 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -151,9 +152,9 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
x = model.get_output_embeddings() x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
def test_forward_signature(self): def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -283,7 +284,7 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -283,7 +284,7 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname, saved_model=True) model.save_pretrained(tmpdirname, saved_model=True)
saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
model = tf.keras.models.load_model(saved_model_dir) model = keras.models.load_model(saved_model_dir)
outputs = model(class_inputs_dict) outputs = model(class_inputs_dict)
output_hidden_states = outputs["hidden_states"] output_hidden_states = outputs["hidden_states"]
output_attentions = outputs["attentions"] output_attentions = outputs["attentions"]
...@@ -443,7 +444,7 @@ class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -443,7 +444,7 @@ class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname, saved_model=True) model.save_pretrained(tmpdirname, saved_model=True)
saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
model = tf.keras.models.load_model(saved_model_dir) model = keras.models.load_model(saved_model_dir)
outputs = model(class_inputs_dict) outputs = model(class_inputs_dict)
output_hidden_states = outputs["hidden_states"] output_hidden_states = outputs["hidden_states"]
output_attentions = outputs["attentions"] output_attentions = outputs["attentions"]
...@@ -565,7 +566,7 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase ...@@ -565,7 +566,7 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
for module_member in (getattr(module, module_member_name),) for module_member in (getattr(module, module_member_name),)
if isinstance(module_member, type) if isinstance(module_member, type)
and tf.keras.layers.Layer in module_member.__bases__ and keras.layers.Layer in module_member.__bases__
and getattr(module_member, "_keras_serializable", False) and getattr(module_member, "_keras_serializable", False)
} }
for main_layer_class in tf_main_layer_classes: for main_layer_class in tf_main_layer_classes:
...@@ -579,17 +580,17 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase ...@@ -579,17 +580,17 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
main_layer = main_layer_class(config) main_layer = main_layer_class(config)
symbolic_inputs = { symbolic_inputs = {
name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
} }
model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
outputs = model(inputs_dict) outputs = model(inputs_dict)
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
filepath = os.path.join(tmpdirname, "keras_model.h5") filepath = os.path.join(tmpdirname, "keras_model.h5")
model.save(filepath) model.save(filepath)
if "T5" in main_layer_class.__name__: if "T5" in main_layer_class.__name__:
model = tf.keras.models.load_model( model = keras.models.load_model(
filepath, filepath,
custom_objects={ custom_objects={
main_layer_class.__name__: main_layer_class, main_layer_class.__name__: main_layer_class,
...@@ -597,10 +598,10 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase ...@@ -597,10 +598,10 @@ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
}, },
) )
else: else:
model = tf.keras.models.load_model( model = keras.models.load_model(
filepath, custom_objects={main_layer_class.__name__: main_layer_class} filepath, custom_objects={main_layer_class.__name__: main_layer_class}
) )
assert isinstance(model, tf.keras.Model) assert isinstance(model, keras.Model)
after_outputs = model(inputs_dict) after_outputs = model(inputs_dict)
self.assert_outputs_same(after_outputs, outputs) self.assert_outputs_same(after_outputs, outputs)
......
...@@ -37,6 +37,7 @@ if is_tf_available(): ...@@ -37,6 +37,7 @@ if is_tf_available():
TFConvBertForTokenClassification, TFConvBertForTokenClassification,
TFConvBertModel, TFConvBertModel,
) )
from transformers.modeling_tf_utils import keras
class TFConvBertModelTester: class TFConvBertModelTester:
...@@ -306,7 +307,7 @@ class TFConvBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test ...@@ -306,7 +307,7 @@ class TFConvBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname, saved_model=True) model.save_pretrained(tmpdirname, saved_model=True)
saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
model = tf.keras.models.load_model(saved_model_dir) model = keras.models.load_model(saved_model_dir)
outputs = model(class_inputs_dict) outputs = model(class_inputs_dict)
if self.is_encoder_decoder: if self.is_encoder_decoder:
......
...@@ -29,6 +29,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin ...@@ -29,6 +29,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers.modeling_tf_utils import keras
from transformers.models.ctrl.modeling_tf_ctrl import ( from transformers.models.ctrl.modeling_tf_ctrl import (
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCTRLForSequenceClassification, TFCTRLForSequenceClassification,
...@@ -226,18 +227,18 @@ class TFCTRLModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase ...@@ -226,18 +227,18 @@ class TFCTRLModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
model.build_in_name_scope() # may be needed for the get_bias() call below model.build_in_name_scope() # may be needed for the get_bias() call below
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) assert isinstance(model.get_input_embeddings(), keras.layers.Layer)
if model_class in list_lm_models: if model_class in list_lm_models:
x = model.get_output_embeddings() x = model.get_output_embeddings()
assert isinstance(x, tf.keras.layers.Layer) assert isinstance(x, keras.layers.Layer)
name = model.get_bias() name = model.get_bias()
assert isinstance(name, dict) assert isinstance(name, dict)
for k, v in name.items(): for k, v in name.items():
assert isinstance(v, tf.Variable) assert isinstance(v, tf.Variable)
elif model_class in list_other_models_with_output_ebd: elif model_class in list_other_models_with_output_ebd:
x = model.get_output_embeddings() x = model.get_output_embeddings()
assert isinstance(x, tf.keras.layers.Layer) assert isinstance(x, keras.layers.Layer)
name = model.get_bias() name = model.get_bias()
assert name is None assert name is None
else: else:
......
...@@ -22,6 +22,7 @@ if is_tf_available(): ...@@ -22,6 +22,7 @@ if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers import TFCvtForImageClassification, TFCvtModel from transformers import TFCvtForImageClassification, TFCvtModel
from transformers.modeling_tf_utils import keras
from transformers.models.cvt.modeling_tf_cvt import TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.cvt.modeling_tf_cvt import TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST
...@@ -191,10 +192,10 @@ class TFCvtModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) ...@@ -191,10 +192,10 @@ class TFCvtModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
@unittest.skip(reason="Get `Failed to determine best cudnn convolution algo.` error after using TF 2.12+cuda 11.8") @unittest.skip(reason="Get `Failed to determine best cudnn convolution algo.` error after using TF 2.12+cuda 11.8")
def test_keras_fit_mixed_precision(self): def test_keras_fit_mixed_precision(self):
policy = tf.keras.mixed_precision.Policy("mixed_float16") policy = keras.mixed_precision.Policy("mixed_float16")
tf.keras.mixed_precision.set_global_policy(policy) keras.mixed_precision.set_global_policy(policy)
super().test_keras_fit() super().test_keras_fit()
tf.keras.mixed_precision.set_global_policy("float32") keras.mixed_precision.set_global_policy("float32")
def test_forward_signature(self): def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
......
...@@ -39,6 +39,7 @@ if is_tf_available(): ...@@ -39,6 +39,7 @@ if is_tf_available():
TFData2VecVisionForSemanticSegmentation, TFData2VecVisionForSemanticSegmentation,
TFData2VecVisionModel, TFData2VecVisionModel,
) )
from transformers.modeling_tf_utils import keras
from transformers.models.data2vec.modeling_tf_data2vec_vision import ( from transformers.models.data2vec.modeling_tf_data2vec_vision import (
TF_DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST, TF_DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
) )
...@@ -216,9 +217,9 @@ class TFData2VecVisionModelTest(TFModelTesterMixin, PipelineTesterMixin, unittes ...@@ -216,9 +217,9 @@ class TFData2VecVisionModelTest(TFModelTesterMixin, PipelineTesterMixin, unittes
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
x = model.get_output_embeddings() x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
def test_forward_signature(self): def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -365,7 +366,7 @@ class TFData2VecVisionModelTest(TFModelTesterMixin, PipelineTesterMixin, unittes ...@@ -365,7 +366,7 @@ class TFData2VecVisionModelTest(TFModelTesterMixin, PipelineTesterMixin, unittes
key: val for key, val in prepared_for_class.items() if key not in label_names key: val for key, val in prepared_for_class.items() if key not in label_names
} }
self.assertGreater(len(inputs_minus_labels), 0) self.assertGreater(len(inputs_minus_labels), 0)
model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True) model.compile(optimizer=keras.optimizers.SGD(0.0), run_eagerly=True)
# Make sure the model fits without crashing regardless of where we pass the labels # Make sure the model fits without crashing regardless of where we pass the labels
history1 = model.fit( history1 = model.fit(
......
...@@ -40,6 +40,7 @@ if is_tf_available(): ...@@ -40,6 +40,7 @@ if is_tf_available():
TFDeiTForMaskedImageModeling, TFDeiTForMaskedImageModeling,
TFDeiTModel, TFDeiTModel,
) )
from transformers.modeling_tf_utils import keras
from transformers.models.deit.modeling_tf_deit import TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.deit.modeling_tf_deit import TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST
...@@ -211,9 +212,9 @@ class TFDeiTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase ...@@ -211,9 +212,9 @@ class TFDeiTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
x = model.get_output_embeddings() x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, tf.keras.layers.Dense)) self.assertTrue(x is None or isinstance(x, keras.layers.Dense))
def test_forward_signature(self): def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
......
...@@ -37,6 +37,7 @@ if is_tf_available(): ...@@ -37,6 +37,7 @@ if is_tf_available():
TFEfficientFormerForImageClassificationWithTeacher, TFEfficientFormerForImageClassificationWithTeacher,
TFEfficientFormerModel, TFEfficientFormerModel,
) )
from transformers.modeling_tf_utils import keras
from transformers.models.efficientformer.modeling_tf_efficientformer import ( from transformers.models.efficientformer.modeling_tf_efficientformer import (
TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
) )
...@@ -355,7 +356,7 @@ class TFEfficientFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unitte ...@@ -355,7 +356,7 @@ class TFEfficientFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unitte
# These are maximally general inputs for the model, with multiple None dimensions # These are maximally general inputs for the model, with multiple None dimensions
# Hopefully this will catch any conditionals that fail for flexible shapes # Hopefully this will catch any conditionals that fail for flexible shapes
functional_inputs = { functional_inputs = {
key: tf.keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key) key: keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key)
for key, val in model.input_signature.items() for key, val in model.input_signature.items()
if key in model.dummy_inputs if key in model.dummy_inputs
} }
......
...@@ -509,7 +509,7 @@ class TFEncoderDecoderMixin: ...@@ -509,7 +509,7 @@ class TFEncoderDecoderMixin:
tf_outputs = tf_model(tf_inputs_dict) tf_outputs = tf_model(tf_inputs_dict)
# tf models returned loss is usually a tensor rather than a scalar. # tf models returned loss is usually a tensor rather than a scalar.
# (see `hf_compute_loss`: it uses `tf.keras.losses.Reduction.NONE`) # (see `hf_compute_loss`: it uses `keras.losses.Reduction.NONE`)
# Change it here to a scalar to match PyTorch models' loss # Change it here to a scalar to match PyTorch models' loss
tf_loss = getattr(tf_outputs, "loss", None) tf_loss = getattr(tf_outputs, "loss", None)
if tf_loss is not None: if tf_loss is not None:
......
...@@ -30,6 +30,7 @@ if is_tf_available(): ...@@ -30,6 +30,7 @@ if is_tf_available():
import numpy import numpy
import tensorflow as tf import tensorflow as tf
from transformers.modeling_tf_utils import keras
from transformers.models.esm.modeling_tf_esm import ( from transformers.models.esm.modeling_tf_esm import (
TF_ESM_PRETRAINED_MODEL_ARCHIVE_LIST, TF_ESM_PRETRAINED_MODEL_ARCHIVE_LIST,
TFEsmForMaskedLM, TFEsmForMaskedLM,
...@@ -269,7 +270,7 @@ class TFEsmModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) ...@@ -269,7 +270,7 @@ class TFEsmModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) assert isinstance(model.get_input_embeddings(), keras.layers.Layer)
if model_class is TFEsmForMaskedLM: if model_class is TFEsmForMaskedLM:
# Output embedding test differs from the main test because they're a matrix, not a layer # Output embedding test differs from the main test because they're a matrix, not a layer
name = model.get_bias() name = model.get_bias()
......
...@@ -10,6 +10,7 @@ from transformers.testing_utils import require_keras_nlp, require_tf, slow ...@@ -10,6 +10,7 @@ from transformers.testing_utils import require_keras_nlp, require_tf, slow
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
if is_keras_nlp_available(): if is_keras_nlp_available():
from transformers.models.gpt2 import TFGPT2Tokenizer from transformers.models.gpt2 import TFGPT2Tokenizer
......
...@@ -46,6 +46,7 @@ if is_tf_available(): ...@@ -46,6 +46,7 @@ if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers import TFGroupViTModel, TFGroupViTTextModel, TFGroupViTVisionModel, TFSharedEmbeddings from transformers import TFGroupViTModel, TFGroupViTTextModel, TFGroupViTVisionModel, TFSharedEmbeddings
from transformers.modeling_tf_utils import keras
from transformers.models.groupvit.modeling_tf_groupvit import TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.groupvit.modeling_tf_groupvit import TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST
...@@ -186,9 +187,9 @@ class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -186,9 +187,9 @@ class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase):
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
x = model.get_output_embeddings() x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
def test_forward_signature(self): def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -340,7 +341,7 @@ class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -340,7 +341,7 @@ class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase):
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname, saved_model=True) model.save_pretrained(tmpdirname, saved_model=True)
saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
model = tf.keras.models.load_model(saved_model_dir) model = keras.models.load_model(saved_model_dir)
outputs = model(class_inputs_dict) outputs = model(class_inputs_dict)
output_hidden_states = outputs["hidden_states"] output_hidden_states = outputs["hidden_states"]
output_attentions = outputs["attentions"] output_attentions = outputs["attentions"]
...@@ -505,7 +506,7 @@ class TFGroupViTTextModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -505,7 +506,7 @@ class TFGroupViTTextModelTest(TFModelTesterMixin, unittest.TestCase):
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname, saved_model=True) model.save_pretrained(tmpdirname, saved_model=True)
saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
model = tf.keras.models.load_model(saved_model_dir) model = keras.models.load_model(saved_model_dir)
outputs = model(class_inputs_dict) outputs = model(class_inputs_dict)
output_hidden_states = outputs["hidden_states"] output_hidden_states = outputs["hidden_states"]
output_attentions = outputs["attentions"] output_attentions = outputs["attentions"]
...@@ -655,7 +656,7 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test ...@@ -655,7 +656,7 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test
and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
for module_member in (getattr(module, module_member_name),) for module_member in (getattr(module, module_member_name),)
if isinstance(module_member, type) if isinstance(module_member, type)
and tf.keras.layers.Layer in module_member.__bases__ and keras.layers.Layer in module_member.__bases__
and getattr(module_member, "_keras_serializable", False) and getattr(module_member, "_keras_serializable", False)
} }
for main_layer_class in tf_main_layer_classes: for main_layer_class in tf_main_layer_classes:
...@@ -669,17 +670,17 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test ...@@ -669,17 +670,17 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test
main_layer = main_layer_class(config) main_layer = main_layer_class(config)
symbolic_inputs = { symbolic_inputs = {
name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
} }
model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
outputs = model(inputs_dict) outputs = model(inputs_dict)
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
filepath = os.path.join(tmpdirname, "keras_model.h5") filepath = os.path.join(tmpdirname, "keras_model.h5")
model.save(filepath) model.save(filepath)
if "T5" in main_layer_class.__name__: if "T5" in main_layer_class.__name__:
model = tf.keras.models.load_model( model = keras.models.load_model(
filepath, filepath,
custom_objects={ custom_objects={
main_layer_class.__name__: main_layer_class, main_layer_class.__name__: main_layer_class,
...@@ -687,10 +688,10 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test ...@@ -687,10 +688,10 @@ class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test
}, },
) )
else: else:
model = tf.keras.models.load_model( model = keras.models.load_model(
filepath, custom_objects={main_layer_class.__name__: main_layer_class} filepath, custom_objects={main_layer_class.__name__: main_layer_class}
) )
assert isinstance(model, tf.keras.Model) assert isinstance(model, keras.Model)
after_outputs = model(inputs_dict) after_outputs = model(inputs_dict)
self.assert_outputs_same(after_outputs, outputs) self.assert_outputs_same(after_outputs, outputs)
......
...@@ -36,6 +36,7 @@ if is_tf_available(): ...@@ -36,6 +36,7 @@ if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers import SamProcessor, TFSamModel from transformers import SamProcessor, TFSamModel
from transformers.modeling_tf_utils import keras
if is_vision_available(): if is_vision_available():
from PIL import Image from PIL import Image
...@@ -322,9 +323,9 @@ class TFSamModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase) ...@@ -322,9 +323,9 @@ class TFSamModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
x = model.get_output_embeddings() x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, tf.keras.layers.Dense)) self.assertTrue(x is None or isinstance(x, keras.layers.Dense))
def test_forward_signature(self): def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
......