Unverified Commit 050e0b44 authored by Matt, committed by GitHub

Proper build() methods for TF (#27794)

* Add a convenience method for building in your own name scope

* Second attempt at auto layer building

* Revert "Second attempt at auto layer building"

This reverts commit e03a3aaecf9ec41a805582b83cbdfe3290a631be.

* Attempt #3

* Revert "Attempt #3"

This reverts commit b9df7a0857560d29b5abbed6127d9e9eca77cf47.

* Add missing attributes that we're going to need later

* Add some attributes we're going to need later

* A fourth attempt! Feel the power flow through you!

* Revert "A fourth attempt! Feel the power flow through you!"

This reverts commit 6bf4aaf3875d6f28485f50187617a4c616c8aff7.

* Add more values we'll need later

* TF refactor that we'll need later

* Revert "TF refactor that we'll need later"

This reverts commit ca07202fb5b7b7436b893baa8d688b4f348ea7b9.

* Revert "Revert "TF refactor that we'll need later""

This reverts commit 1beb0f39f293ed9c27594575e1c849aadeb15c13.

* make fixup

* Attempt five!

* Revert "Attempt five!"

This reverts commit 3302207958dfd0374b0447a51c06eea51a506044.

* Attempt six - this time don't add empty methods

* Revert "Attempt six - this time don't add empty methods"

This reverts commit 67d60129be75416b6beb8f47c7d38d77b18d79bb.

* Attempt seven - better base model class detection!

* Revert "Attempt seven - better base model class detection!"

This reverts commit 5f14845e92ea0e87c598da933bfbfee10f553bc9.

* Another attribute we'll need later

* Try again with the missing attribute!

* Revert "Try again with the missing attribute!"

This reverts commit 760c6f30c5dffb3e04b0e73c34a77d1882a0fef7.

* This is the attempt that will pierce the heavens!

* Revert "This is the attempt that will pierce the heavens!"

This reverts commit c868bb657de057aca7a5260350a3f831fc4dfee6.

* Attempt seven - snag list is steadily decreasing

* Revert "Attempt seven - snag list is steadily decreasing"

This reverts commit 46fbd975deda64429bfb3e5fac4fc0370c00d316.

* Attempt eight - will an empty snag list do it?

* Revert "Attempt eight - will an empty snag list do it?"

This reverts commit 7c8a3c2b083253649569e9877e02054ae5cec67b.

* Fixes to Hubert issues that cause problems later

* Trying again with Conv1D/SeparableConv fixes

* Revert "Trying again with Conv1D/SeparableConv fixes"

This reverts commit 55092bca952bc0f750aa1ffe246a640bf1e2036e.

* Apply the build shape fixes to Wav2Vec2 as well

* One more attempt!

* Revert "One more attempt!"

This reverts commit 5ac3e4cb01b9458cc93312873725f9444ae7261c.

* Another attempt!

* Revert "Another attempt!"

This reverts commit ea16d890e019d7de8792a3b8e72f3b1c02adae50.

* Let's see how many failures we get without the internal build method

* Fix OpenAI

* Fix MobileBERT

* (Mostly) fix GroupViT

* Fix BLIP

* One more BLIP fix

* One more BLIP fix!

* Fix Regnet

* Finally fully fix GroupViT

* Fix Data2Vec and add the new AdaptivePool

* Fix Segformer

* Fix Albert

* Fix Deberta/DebertaV2

* Fix XLM

* Actually fix XLM

* Fix Flaubert

* Fix lxmert

* Fix Resnet

* Fix ConvBERT

* Fix ESM

* Fix Convnext / ConvnextV2

* Fix SAM

* Fix Efficientformer

* Fix LayoutLMv3

* Fix speech_to_text

* Fix mpnet and mobilevit

* Fix Swin

* Fix CTRL

* Fix CVT

* Fix DPR

* Fix Wav2Vec2

* Fix T5

* Fix Hubert

* Fix GPT2

* Fix Whisper

* Fix DeiT

* Fix the encoder-decoder / dual-encoder classes

* make fix-copies

* build in name scope

* Fix summarization test

* Fix tied weight names for BART + Blenderbot

* Fix tied weight name building

* Fix to TFESM weight building

* Update TF SAM

* Expand all the shapes out into Big Boy Shapes
parent 52c37882
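
Before the diff itself, a quick summary of the pattern this PR rolls out across the TF models: every layer gets an idempotent build() that opens a tf.name_scope for each sublayer and builds it with a static shape taken from the config, and TFPreTrainedModel gains a build_in_name_scope() convenience method that wraps build(None) in the model's own name scope, so weights get checkpoint-compatible names without running dummy inputs through the model. The sketch below is only illustrative: ExampleAttention, ExampleModel and hidden_size are made-up names, not code from this diff.

import tensorflow as tf

class ExampleAttention(tf.keras.layers.Layer):
    # Illustrative sublayer following the pattern applied throughout this PR.
    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.query = tf.keras.layers.Dense(hidden_size, name="query")

    def call(self, hidden_states):
        return self.query(hidden_states)

    def build(self, input_shape=None):
        if self.built:  # idempotent: safe to call again from a parent's build()
            return
        self.built = True
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                # explicit static shape from the config instead of a dummy forward pass
                self.query.build([None, None, self.hidden_size])

class ExampleModel(tf.keras.Model):
    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.attention = ExampleAttention(hidden_size, name="attention")

    def call(self, hidden_states):
        return self.attention(hidden_states)

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        with tf.name_scope(self.attention.name):
            self.attention.build(None)

    def build_in_name_scope(self):
        # analogous to the convenience method added on TFPreTrainedModel
        with tf.name_scope(self.name):
            self.build(input_shape=None)

model = ExampleModel(hidden_size=64, name="example_model")
model.build_in_name_scope()  # creates all weights, no dummy inputs needed

Because build() on an already-built layer is a no-op, parent layers in the diff can unconditionally build their children.
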
@@ -35,7 +35,6 @@ import tensorflow as tf
from huggingface_hub import Repository, list_repo_files
from keras import backend as K
from packaging.version import parse
-from tensorflow.python.util.keras_deps import get_call_context_function
from . import DataCollatorWithPadding, DefaultDataCollator
from .activations_tf import get_tf_activation
@@ -1122,6 +1121,10 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
)
return dummies
def build_in_name_scope(self):
with tf.name_scope(self.name):
self.build(input_shape=None)
@property
def framework(self) -> str:
"""
@@ -1130,15 +1133,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
return "tf"
def build(self, input_shape=None):
-call_context = get_call_context_function()
-if self.built or call_context().in_call:
-self.built = True
-else:
-self.built = True
-# Set the serving spec quickly to ensure that Keras doesn't use the specific dummy input shapes as the spec
-# Setting it in build() allows users to override the shape when loading a non-pretrained model from config
-self._set_save_spec(self.input_signature)
-self(self.dummy_inputs, training=False)
pass # This is just here to make sure we don't call the superclass build()
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
@@ -1869,7 +1864,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
main_layer.set_input_embeddings(value)
except AttributeError:
logger.info("Building the model")
-self.build()
self.build_in_name_scope()
main_layer.set_input_embeddings(value)
def get_output_embeddings(self) -> Union[None, tf.keras.layers.Layer]:
@@ -1886,7 +1881,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
return lm_head.get_output_embeddings()
except AttributeError:
logger.info("Building the model")
-self.build()
self.build_in_name_scope()
return lm_head().get_output_embeddings()
@@ -1906,7 +1901,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
lm_head.set_output_embeddings(value)
except AttributeError:
logger.info("Building the model")
-self.build()
self.build_in_name_scope()
lm_head.set_output_embeddings(value)
def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
@@ -1944,7 +1939,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
try:
return lm_head.get_bias()
except AttributeError:
-self.build()
self.build_in_name_scope()
return lm_head.get_bias()
return None
@@ -1962,7 +1957,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
try:
lm_head.set_bias(value)
except AttributeError:
-self.build()
self.build_in_name_scope()
lm_head.set_bias(value)
def get_lm_head(self) -> tf.keras.layers.Layer:
@@ -2049,7 +2044,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
# The reason why the attributes don't exist might be
# because the model is not built, so retry getting
# the argument after building the model
-model.build()
model.build_in_name_scope()
embeds = getattr(embedding_layer, "weight", None)
if embeds is not None:
@@ -2914,9 +2909,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
# we might need to extend the variable scope for composite models
if load_weight_prefix is not None:
with tf.compat.v1.variable_scope(load_weight_prefix):
-model.build() # build the network with dummy inputs
model.build_in_name_scope() # build the network with dummy inputs
else:
-model.build() # build the network with dummy inputs
model.build_in_name_scope() # build the network with dummy inputs
if safetensors_from_pt:
from .modeling_tf_pytorch_utils import load_pytorch_state_dict_in_tf2_model
@@ -3215,6 +3210,9 @@ class TFConv1D(tf.keras.layers.Layer):
self.initializer_range = initializer_range
def build(self, input_shape):
if self.built:
return
self.built = True
self.weight = self.add_weight(
"weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
)
@@ -3398,6 +3396,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
if self.has_last_dropout:
self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
self.hidden_size = config.hidden_size
def call(self, inputs, cls_index=None, training=False):
if not isinstance(inputs, (dict, tuple, list)):
@@ -3450,6 +3449,14 @@ class TFSequenceSummary(tf.keras.layers.Layer):
return output
def build(self, input_shape):
if self.built:
return
self.built = True
if getattr(self, "summary", None) is not None:
with tf.name_scope("summary"):
self.summary.build(self.hidden_size)
def get_initializer(initializer_range: float = 0.02) -> tf.keras.initializers.TruncatedNormal:
"""
...
@@ -146,7 +146,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
@@ -168,7 +168,12 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
-super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
def call(
@@ -246,6 +251,7 @@ class TFAlbertAttention(tf.keras.layers.Layer):
# Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -307,6 +313,26 @@ class TFAlbertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFAlbertLayer(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
@@ -329,6 +355,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(
self,
@@ -356,6 +383,23 @@ class TFAlbertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build([None, None, self.config.hidden_size])
if getattr(self, "ffn_output", None) is not None:
with tf.name_scope(self.ffn_output.name):
self.ffn_output.build([None, None, self.config.intermediate_size])
if getattr(self, "full_layer_layer_norm", None) is not None:
with tf.name_scope(self.full_layer_layer_norm.name):
self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
class TFAlbertLayerGroup(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
@@ -399,6 +443,15 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert_layers", None) is not None:
for layer in self.albert_layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFAlbertTransformer(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
@@ -416,6 +469,7 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
self.albert_layer_groups = [
TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
]
self.config = config
def call(
self,
@@ -457,6 +511,18 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding_hidden_mapping_in", None) is not None:
with tf.name_scope(self.embedding_hidden_mapping_in.name):
self.embedding_hidden_mapping_in.build([None, None, self.config.embedding_size])
if getattr(self, "albert_layer_groups", None) is not None:
for layer in self.albert_layer_groups:
with tf.name_scope(layer.name):
layer.build(None)
class TFAlbertPreTrainedModel(TFPreTrainedModel):
"""
@@ -488,13 +554,21 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
-def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
self.decoder_bias = self.add_weight(
shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
)
-super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.decoder
@@ -650,6 +724,20 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build([None, None, self.config.hidden_size])
@dataclass
class TFAlbertForPreTrainingOutput(ModelOutput):
@@ -825,6 +913,14 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
@add_start_docstrings(
"""
@@ -921,6 +1017,20 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
if getattr(self, "sop_classifier", None) is not None:
with tf.name_scope(self.sop_classifier.name):
self.sop_classifier.build(None)
class TFAlbertSOPHead(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
@@ -932,6 +1042,7 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
@@ -939,6 +1050,14 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
return logits
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING) @add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss): class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
...@@ -1035,6 +1154,17 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) ...@@ -1035,6 +1154,17 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
attentions=outputs.attentions, attentions=outputs.attentions,
) )
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@add_start_docstrings(
"""
@@ -1058,6 +1188,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1117,6 +1248,17 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1145,6 +1287,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1200,6 +1343,17 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1221,6 +1375,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
self.qa_outputs = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1295,6 +1450,17 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1316,6 +1482,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
self.classifier = tf.keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1394,3 +1561,14 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@@ -43,7 +43,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
-ContextManagers,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
@@ -296,6 +295,23 @@ class TFBartAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFBartEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: BartConfig, **kwargs):
@@ -311,6 +327,7 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
@@ -352,6 +369,26 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBartDecoderLayer(tf.keras.layers.Layer):
def __init__(self, config: BartConfig, **kwargs):
@@ -380,6 +417,7 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
@@ -461,6 +499,32 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBartClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
@@ -470,6 +534,8 @@ class TFBartClassificationHead(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(inner_dim, name="dense")
self.dropout = tf.keras.layers.Dropout(pooler_dropout)
self.out_proj = tf.keras.layers.Dense(num_classes, name="out_proj")
self.input_dim = inner_dim
self.inner_dim = inner_dim
def call(self, inputs):
hidden_states = self.dropout(inputs)
@@ -479,6 +545,17 @@ class TFBartClassificationHead(tf.keras.layers.Layer):
hidden_states = self.out_proj(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.input_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.inner_dim])
class TFBartPretrainedModel(TFPreTrainedModel):
config_class = BartConfig
@@ -686,6 +763,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
)
self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model
@unpack_inputs
def call(
@@ -745,16 +823,8 @@ class TFBartEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
-# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
-# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
-# is used with a name ending in `/`, that name replaces the current name scope.
-# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
-context = []
-if hasattr(self.embed_tokens, "load_weight_prefix"):
-context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
-with ContextManagers(context):
-check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
-inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
@@ -809,6 +879,21 @@ class TFBartEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.embed_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBartDecoder(tf.keras.layers.Layer):
@@ -938,16 +1023,8 @@ class TFBartDecoder(tf.keras.layers.Layer):
positions = self.embed_positions(input_shape, position_ids=position_ids)
if inputs_embeds is None:
-# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
-# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
-# is used with a name ending in `/`, that name replaces the current name scope.
-# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
-context = []
-if hasattr(self.embed_tokens, "load_weight_prefix"):
-context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
-with ContextManagers(context):
-check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
-inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
hidden_states = inputs_embeds
@@ -1032,6 +1109,21 @@ class TFBartDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBartMainLayer(tf.keras.layers.Layer):
@@ -1149,6 +1241,22 @@ class TFBartMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare BART Model outputting raw hidden-states without any specific head on top.",
@@ -1237,6 +1345,14 @@ class TFBartModel(TFBartPretrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
class BiasLayer(tf.keras.layers.Layer):
"""
@@ -1440,6 +1556,17 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageMode
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
@add_start_docstrings(
"""
@@ -1567,3 +1694,14 @@ class TFBartForSequenceClassification(TFBartPretrainedModel, TFSequenceClassific
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "classification_head", None) is not None:
with tf.name_scope(self.classification_head.name):
self.classification_head.build(None)
@@ -156,7 +156,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
@@ -178,7 +178,12 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
-super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def call(
self,
@@ -248,6 +253,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -337,6 +343,20 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFBertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -347,6 +367,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -355,6 +376,17 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertAttention(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -395,6 +427,17 @@ class TFBertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFBertIntermediate(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -408,6 +451,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -415,6 +459,14 @@ class TFBertIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFBertOutput(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -425,6 +477,7 @@ class TFBertOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -433,6 +486,17 @@ class TFBertOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLayer(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -519,6 +583,23 @@ class TFBertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
class TFBertEncoder(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -588,6 +669,15 @@ class TFBertEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFBertPooler(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -599,6 +689,7 @@ class TFBertPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
@@ -608,6 +699,14 @@ class TFBertPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -625,6 +724,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -633,6 +733,17 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
@@ -647,10 +758,15 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.input_embeddings
@@ -688,6 +804,14 @@ class TFBertMLMHead(tf.keras.layers.Layer):
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
class TFBertNSPHead(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
@@ -698,12 +822,21 @@ class TFBertNSPHead(tf.keras.layers.Layer):
kernel_initializer=get_initializer(config.initializer_range),
name="seq_relationship",
)
self.config = config
def call(self, pooled_output: tf.Tensor) -> tf.Tensor:
seq_relationship_score = self.seq_relationship(inputs=pooled_output)
return seq_relationship_score
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "seq_relationship", None) is not None:
with tf.name_scope(self.seq_relationship.name):
self.seq_relationship.build([None, None, self.config.hidden_size])
@keras_serializable
class TFBertMainLayer(tf.keras.layers.Layer):
@@ -891,6 +1024,20 @@ class TFBertMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFBertPreTrainedModel(TFPreTrainedModel):
"""
@@ -1103,6 +1250,14 @@ class TFBertModel(TFBertPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
@add_start_docstrings(
"""
@@ -1215,6 +1370,20 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "nsp", None) is not None:
with tf.name_scope(self.nsp.name):
self.nsp.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING)
class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
...@@ -1301,6 +1470,17 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ...@@ -1301,6 +1470,17 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
attentions=outputs.attentions, attentions=outputs.attentions,
) )
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
# names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
@@ -1426,6 +1606,17 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top.""",
@@ -1508,6 +1699,17 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "nsp", None) is not None:
with tf.name_scope(self.nsp.name):
self.nsp.build(None)
@add_start_docstrings(
"""
@@ -1536,6 +1738,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1594,6 +1797,17 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1615,6 +1829,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
self.classifier = tf.keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1693,6 +1908,17 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1727,6 +1953,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1783,6 +2010,17 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1812,6 +2050,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
kernel_initializer=get_initializer(config.initializer_range),
name="qa_outputs",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1884,3 +2123,14 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@@ -41,7 +41,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
@@ -291,6 +290,23 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot
class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
@@ -307,6 +323,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
@@ -348,6 +365,26 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot
class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
@@ -377,6 +414,7 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
@@ -458,6 +496,32 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBlenderbotPreTrainedModel(TFPreTrainedModel):
config_class = BlenderbotConfig
@@ -711,16 +775,8 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
@@ -776,6 +832,21 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBlenderbotDecoder(tf.keras.layers.Layer):
@@ -916,12 +987,8 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
positions = self.embed_positions(input_shape, position_ids=position_ids)
if inputs_embeds is None:
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
hidden_states = inputs_embeds
@@ -1006,6 +1073,21 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBlenderbotMainLayer(tf.keras.layers.Layer):
@@ -1114,6 +1196,22 @@ class TFBlenderbotMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
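The comment in this build() relies on a tf.name_scope detail: a scope name that ends with "/" is used as-is instead of being nested under the current scope, which is how the shared embedding's weights land under the load-weight prefix at the top level rather than inside the main layer's scope. A small illustration of that behaviour (the scope and variable names here are made up):

```python
import tensorflow as tf

with tf.name_scope("outer"):
    with tf.name_scope("nested"):
        v_nested = tf.Variable(0.0, name="weight")  # nested under the current scope
    with tf.name_scope("model/shared/"):  # trailing "/" -> scope is used as-is, not nested
        v_rooted = tf.Variable(0.0, name="weight")

print(v_nested.name)  # expected: "outer/nested/weight:0"
print(v_rooted.name)  # expected: "model/shared/weight:0"
```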
@add_start_docstrings(
"The bare BLENDERBOT Model outputting raw hidden-states without any specific head on top.",
@@ -1217,6 +1315,14 @@ class TFBlenderbotModel(TFBlenderbotPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer):
@@ -1436,3 +1542,14 @@ class TFBlenderbotForConditionalGeneration(TFBlenderbotPreTrainedModel, TFCausal
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
@@ -40,7 +40,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
@@ -291,6 +290,23 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
@@ -307,6 +323,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
@@ -348,6 +365,26 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
@@ -377,6 +414,7 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
@@ -458,6 +496,32 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBlenderbotSmallPreTrainedModel(TFPreTrainedModel):
config_class = BlenderbotSmallConfig
@@ -646,6 +710,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
)
self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model
def get_embed_tokens(self):
return self.embed_tokens
@@ -717,16 +782,8 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
@@ -781,6 +838,21 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.embed_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
@@ -917,16 +989,8 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
if input_shape[-1] > 1:
@@ -1014,6 +1078,21 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer):
@@ -1122,6 +1201,22 @@ class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare BLENDERBOT_SMALL Model outputting raw hidden-states without any specific head on top.",
@@ -1209,6 +1304,14 @@ class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer):
@@ -1413,3 +1516,14 @@ class TFBlenderbotSmallForConditionalGeneration(TFBlenderbotSmallPreTrainedModel
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
@@ -254,7 +254,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
def build(self, input_shape):
def build(self, input_shape=None):
self.class_embedding = self.add_weight(
shape=(1, 1, self.embed_dim),
initializer=get_initializer(self.config.initializer_range),
@@ -268,7 +268,13 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
trainable=True,
name="position_embedding",
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embedding", None) is not None:
with tf.name_scope(self.patch_embedding.name):
self.patch_embedding.build([None, None, None, 3])
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
# Input is channels-first, we transpose. PyTorch transposes after the conv because PyTorch
@@ -412,6 +418,20 @@ class TFBlipAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "qkv", None) is not None:
with tf.name_scope(self.qkv.name):
self.qkv.build([None, None, self.embed_dim])
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, self.embed_dim])
class TFBlipMLP(tf.keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs):
@@ -428,6 +448,7 @@ class TFBlipMLP(tf.keras.layers.Layer):
self.fc2 = tf.keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.fc1(inputs=hidden_states)
@@ -435,6 +456,17 @@ class TFBlipMLP(tf.keras.layers.Layer):
hidden_states = self.fc2(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.config.hidden_size])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.intermediate_size])
class TFBlipEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs):
@@ -485,6 +517,23 @@ class TFBlipEncoderLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, self.embed_dim])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, self.embed_dim])
class TFBlipPreTrainedModel(TFPreTrainedModel):
"""
@@ -645,6 +694,15 @@ class TFBlipEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFBlipVisionModel(TFBlipPreTrainedModel):
main_input_name = "pixel_values"
@@ -657,6 +715,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
self.encoder = TFBlipEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.embed_dim = config.hidden_size
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
@@ -724,6 +783,20 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
def get_input_embeddings(self):
return self.embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "post_layernorm", None) is not None:
with tf.name_scope(self.post_layernorm.name):
self.post_layernorm.build([None, None, self.embed_dim])
class TFBlipMainLayer(tf.keras.layers.Layer):
config_class = BlipConfig
@@ -775,7 +848,22 @@ class TFBlipMainLayer(tf.keras.layers.Layer):
initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True,
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection.name):
self.visual_projection.build([None, None, self.vision_embed_dim])
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection.name):
self.text_projection.build([None, None, self.text_embed_dim])
@unpack_inputs
def call(
@@ -995,6 +1083,14 @@ class TFBlipModel(TFBlipPreTrainedModel):
return image_features
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "blip", None) is not None:
with tf.name_scope(self.blip.name):
self.blip.build(None)
@add_start_docstrings(
"""
@@ -1168,6 +1264,17 @@ class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "text_decoder", None) is not None:
with tf.name_scope(self.text_decoder.name):
self.text_decoder.build(None)
@add_start_docstrings(
"""
@@ -1409,6 +1516,20 @@ class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "text_encoder", None) is not None:
with tf.name_scope(self.text_encoder.name):
self.text_encoder.build(None)
if getattr(self, "text_decoder", None) is not None:
with tf.name_scope(self.text_decoder.name):
self.text_decoder.build(None)
@add_start_docstrings(
"""
@@ -1457,6 +1578,7 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
if not hasattr(config, "decoder_start_token_id")
else config.decoder_start_token_id
)
self.config = config
def get_input_embeddings(self) -> tf.keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding
@@ -1558,3 +1680,23 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
attentions=vision_outputs.attentions,
question_embeds=question_embeds,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "text_encoder", None) is not None:
with tf.name_scope(self.text_encoder.name):
self.text_encoder.build(None)
if getattr(self, "vision_proj", None) is not None:
with tf.name_scope(self.vision_proj.name):
self.vision_proj.build([None, None, self.config.vision_config.hidden_size])
if getattr(self, "text_proj", None) is not None:
with tf.name_scope(self.text_proj.name):
self.text_proj.build([None, None, self.config.text_config.hidden_size])
if getattr(self, "itm_head", None) is not None:
with tf.name_scope(self.itm_head.name):
self.itm_head.build([None, None, self.config.text_config.hidden_size])
@@ -127,6 +127,23 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
embeddings = self.dropout(embeddings, training=training)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "word_embeddings", None) is not None:
with tf.name_scope(self.word_embeddings.name):
self.word_embeddings.build(None)
if getattr(self, "position_embeddings", None) is not None:
with tf.name_scope(self.position_embeddings.name):
self.position_embeddings.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
class TFBlipTextSelfAttention(tf.keras.layers.Layer):
@@ -160,6 +177,7 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
self.distance_embedding = tf.keras.layers.Embedding(
2 * config.max_position_embeddings - 1, self.attention_head_size
)
self.is_cross_attention = is_cross_attention
def transpose_for_scores(self, x):
new_x_shape = tf.concat(
@@ -250,6 +268,28 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if self.is_cross_attention:
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.encoder_hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.encoder_hidden_size])
else:
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFBlipTextSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
@@ -260,6 +300,7 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -268,6 +309,17 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
class TFBlipTextAttention(tf.keras.layers.Layer):
@@ -302,6 +354,17 @@ class TFBlipTextAttention(tf.keras.layers.Layer):
outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
class TFBlipTextIntermediate(tf.keras.layers.Layer):
@@ -316,6 +379,7 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dense(inputs=hidden_states)
...@@ -323,6 +387,14 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer): ...@@ -323,6 +387,14 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer):
return hidden_states return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFBlipTextOutput(tf.keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
@@ -333,6 +405,7 @@ class TFBlipTextOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -341,6 +414,17 @@ class TFBlipTextOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -400,6 +484,23 @@ class TFBlipTextLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386
@keras_serializable
@@ -481,6 +582,15 @@ class TFBlipTextEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
class TFBlipTextPooler(tf.keras.layers.Layer):
@@ -493,6 +603,7 @@ class TFBlipTextPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
@@ -502,6 +613,14 @@ class TFBlipTextPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
@@ -520,6 +639,7 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -528,6 +648,17 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -546,7 +677,16 @@ class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
def build(self, input_shape=None):
self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True)
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build([None, None, self.config.hidden_size])
def call(self, hidden_states):
hidden_states = self.transform(hidden_states)
@@ -563,6 +703,14 @@ class TFBlipTextOnlyMLMHead(tf.keras.layers.Layer):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548
class TFBlipTextPreTrainedModel(TFPreTrainedModel):
@@ -802,6 +950,20 @@ class TFBlipTextModel(TFBlipTextPreTrainedModel):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
@@ -942,3 +1104,14 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
for layer_past in past_key_values:
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
return reordered_past
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "cls", None) is not None:
with tf.name_scope(self.cls.name):
self.cls.build(None)
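Every build() added in this file follows the same shape: return early if the layer is already built, mark it built, then build each sublayer inside a tf.name_scope matching that sublayer's name, passing the input shape it will receive (or None when the sublayer can work it out itself). A minimal, self-contained sketch of the pattern follows; the class and sizes are illustrative only, not taken from this diff.

import tensorflow as tf


class ExampleBlock(tf.keras.layers.Layer):
    # Toy layer demonstrating the explicit-build pattern used above.
    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
        self.LayerNorm = tf.keras.layers.LayerNormalization(name="LayerNorm")

    def call(self, hidden_states):
        return self.LayerNorm(self.dense(hidden_states))

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Build each sublayer inside its own name scope so variable names stay
        # stable, with the last input dimension made explicit.
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.hidden_size])


# Calling build() directly creates all weights without a forward pass.
block = ExampleBlock(hidden_size=16)
block.build(None)
print([w.name for w in block.weights])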
@@ -184,7 +184,7 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
@@ -206,7 +206,12 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
@@ -279,6 +284,7 @@ class TFCamembertPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
@@ -288,6 +294,14 @@ class TFCamembertPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert
class TFCamembertSelfAttention(tf.keras.layers.Layer):
@@ -317,6 +331,7 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -406,6 +421,20 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert
class TFCamembertSelfOutput(tf.keras.layers.Layer):
@@ -417,6 +446,7 @@ class TFCamembertSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -425,6 +455,17 @@ class TFCamembertSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert
class TFCamembertAttention(tf.keras.layers.Layer):
@@ -466,6 +507,17 @@ class TFCamembertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert
class TFCamembertIntermediate(tf.keras.layers.Layer):
@@ -480,6 +532,7 @@ class TFCamembertIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -487,6 +540,14 @@ class TFCamembertIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert
class TFCamembertOutput(tf.keras.layers.Layer):
@@ -498,6 +559,7 @@ class TFCamembertOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -506,6 +568,17 @@ class TFCamembertOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert
class TFCamembertLayer(tf.keras.layers.Layer):
@@ -593,6 +666,23 @@ class TFCamembertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert
class TFCamembertEncoder(tf.keras.layers.Layer):
@@ -663,6 +753,15 @@ class TFCamembertEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert
@@ -861,6 +960,20 @@ class TFCamembertMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
class TFCamembertPreTrainedModel(TFPreTrainedModel):
"""
@@ -945,6 +1058,14 @@ class TFCamembertModel(TFCamembertPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert
class TFCamembertLMHead(tf.keras.layers.Layer):
@@ -965,10 +1086,18 @@ class TFCamembertLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
@@ -1080,6 +1209,17 @@ class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelin
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead
class TFCamembertClassificationHead(tf.keras.layers.Layer):
@@ -1100,6 +1240,7 @@ class TFCamembertClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
@@ -1109,6 +1250,17 @@ class TFCamembertClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1186,6 +1338,17 @@ class TFCamembertForSequenceClassification(TFCamembertPreTrainedModel, TFSequenc
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
@@ -1212,6 +1375,7 @@ class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClass
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1270,6 +1434,17 @@ class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClass
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1292,6 +1467,7 @@ class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceL
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
@@ -1363,6 +1539,17 @@ class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1384,6 +1571,7 @@ class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsw
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1456,6 +1644,17 @@ class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsw
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
@@ -1581,3 +1780,14 @@ class TFCamembertForCausalLM(TFCamembertPreTrainedModel, TFCausalLanguageModelin
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
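The getattr(self, ..., None) checks in these build() methods exist because some sublayers are only created for particular configurations; a cross-attention block, for instance, is only instantiated when the config marks the model as a decoder. A small sketch of why the guard matters, using made-up layer names rather than anything from this diff:

import tensorflow as tf


class ExampleLayerWithOptionalCrossAttention(tf.keras.layers.Layer):
    # Illustrates why build() guards each sublayer with getattr(...).
    def __init__(self, hidden_size, is_decoder=False, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.attention = tf.keras.layers.Dense(hidden_size, name="attention")
        # The cross-attention sublayer only exists for decoder configurations,
        # so build() must not assume the attribute is present.
        if is_decoder:
            self.crossattention = tf.keras.layers.Dense(hidden_size, name="crossattention")

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build([None, None, self.hidden_size])
        if getattr(self, "crossattention", None) is not None:
            with tf.name_scope(self.crossattention.name):
                self.crossattention.build([None, None, self.hidden_size])


encoder_layer = ExampleLayerWithOptionalCrossAttention(hidden_size=8)
decoder_layer = ExampleLayerWithOptionalCrossAttention(hidden_size=8, is_decoder=True)
encoder_layer.build(None)
decoder_layer.build(None)
print(len(encoder_layer.weights), len(decoder_layer.weights))  # 2 vs 4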
@@ -169,7 +169,12 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
name="embeddings",
)
if self.built:
return
self.built = True
if getattr(self, "patch_embedding", None) is not None:
with tf.name_scope(self.patch_embedding.name):
self.patch_embedding.build([None, None, None, self.config.num_channels])
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
"""`pixel_values` is expected to be of NCHW format."""
@@ -352,6 +357,23 @@ class TFCLIPAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFCLIPMLP(tf.keras.layers.Layer):
def __init__(self, config: CLIPConfig, **kwargs):
@@ -369,6 +391,7 @@ class TFCLIPMLP(tf.keras.layers.Layer):
self.fc2 = tf.keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.fc1(inputs=hidden_states)
@@ -376,6 +399,17 @@ class TFCLIPMLP(tf.keras.layers.Layer):
hidden_states = self.fc2(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.config.hidden_size])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.intermediate_size])
class TFCLIPEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: CLIPConfig, **kwargs):
@@ -428,6 +462,23 @@ class TFCLIPEncoderLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, self.embed_dim])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, self.embed_dim])
class TFCLIPEncoder(tf.keras.layers.Layer):
"""
@@ -483,6 +534,15 @@ class TFCLIPEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFCLIPTextTransformer(tf.keras.layers.Layer):
def __init__(self, config: CLIPTextConfig, **kwargs):
@@ -496,6 +556,7 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer):
# For `pooled_output` computation
self.eos_token_id = config.eos_token_id
self.embed_dim = config.hidden_size
def call(
self,
@@ -586,6 +647,20 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer):
return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
@keras_serializable
class TFCLIPTextMainLayer(tf.keras.layers.Layer):
@@ -634,6 +709,14 @@ class TFCLIPTextMainLayer(tf.keras.layers.Layer):
return text_model_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
class TFCLIPVisionTransformer(tf.keras.layers.Layer):
def __init__(self, config: CLIPVisionConfig, **kwargs):
@@ -643,6 +726,7 @@ class TFCLIPVisionTransformer(tf.keras.layers.Layer):
self.pre_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
self.encoder = TFCLIPEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.embed_dim = config.hidden_size
def call(
self,
@@ -679,6 +763,23 @@ class TFCLIPVisionTransformer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "pre_layernorm", None) is not None:
with tf.name_scope(self.pre_layernorm.name):
self.pre_layernorm.build([None, None, self.embed_dim])
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "post_layernorm", None) is not None:
with tf.name_scope(self.post_layernorm.name):
self.post_layernorm.build([None, self.embed_dim])
@keras_serializable
class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
@@ -714,6 +815,14 @@ class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
return vision_model_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
@keras_serializable
class TFCLIPMainLayer(tf.keras.layers.Layer):
@@ -757,6 +866,8 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
use_bias=False,
name="text_projection",
)
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
def build(self, input_shape: tf.TensorShape = None):
self.logit_scale = self.add_weight(
@@ -766,7 +877,21 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
name="logit_scale",
)
if self.built:
return
self.built = True
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection.name):
self.visual_projection.build([None, None, self.vision_embed_dim])
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection.name):
self.text_projection.build([None, None, self.text_embed_dim])
@unpack_inputs
def get_text_features(
@@ -1108,6 +1233,14 @@ class TFCLIPTextModel(TFCLIPPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "clip", None) is not None:
with tf.name_scope(self.clip.name):
self.clip.build(None)
class TFCLIPVisionModel(TFCLIPPreTrainedModel):
config_class = CLIPVisionConfig
@@ -1162,6 +1295,14 @@ class TFCLIPVisionModel(TFCLIPPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "clip", None) is not None:
with tf.name_scope(self.clip.name):
self.clip.build(None)
@add_start_docstrings(CLIP_START_DOCSTRING)
class TFCLIPModel(TFCLIPPreTrainedModel):
@@ -1313,3 +1454,11 @@ class TFCLIPModel(TFCLIPPreTrainedModel):
# TensorFlow cannot trace through nested dataclasses. Reference:
# https://github.com/huggingface/transformers/pull/16886
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "clip", None) is not None:
with tf.name_scope(self.clip.name):
self.clip.build(None)
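The shape passed to each sublayer's build() is the shape of the tensor that sublayer consumes, which is why an MLP's first dense layer is built from hidden_size while the second is built from intermediate_size, and why the CLIP projections above use the text/vision embedding widths. A tiny standalone illustration (the sizes are arbitrary, not taken from any config in this diff):

import tensorflow as tf

# Hypothetical sizes standing in for config.hidden_size / config.intermediate_size.
hidden_size, intermediate_size = 32, 64

fc1 = tf.keras.layers.Dense(intermediate_size, name="fc1")
fc2 = tf.keras.layers.Dense(hidden_size, name="fc2")

# The shape handed to build() is the shape of the *input* each dense layer sees:
# fc1 consumes hidden states, fc2 consumes fc1's expanded output.
with tf.name_scope(fc1.name):
    fc1.build([None, None, hidden_size])
with tf.name_scope(fc2.name):
    fc2.build([None, None, intermediate_size])

print(fc1.kernel.shape, fc2.kernel.shape)  # (32, 64) and (64, 32)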
@@ -81,7 +81,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
@@ -103,7 +103,12 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
def call(
@@ -208,6 +213,7 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, x, batch_size):
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -297,6 +303,29 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
if getattr(self, "key_conv_attn_layer", None) is not None:
with tf.name_scope(self.key_conv_attn_layer.name):
self.key_conv_attn_layer.build([None, None, self.config.hidden_size])
if getattr(self, "conv_kernel_layer", None) is not None:
with tf.name_scope(self.conv_kernel_layer.name):
self.conv_kernel_layer.build([None, None, self.all_head_size])
if getattr(self, "conv_out_layer", None) is not None:
with tf.name_scope(self.conv_out_layer.name):
self.conv_out_layer.build([None, None, self.config.hidden_size])
class TFConvBertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -307,6 +336,7 @@ class TFConvBertSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
@@ -315,6 +345,17 @@ class TFConvBertSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFConvBertAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -335,6 +376,17 @@ class TFConvBertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class GroupedLinearLayer(tf.keras.layers.Layer):
def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs):
@@ -389,6 +441,7 @@ class TFConvBertIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
@@ -396,6 +449,14 @@ class TFConvBertIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFConvBertOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -415,6 +476,7 @@ class TFConvBertOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
@@ -423,6 +485,17 @@ class TFConvBertOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
class TFConvBertLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -443,6 +516,20 @@ class TFConvBertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
class TFConvBertEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -486,6 +573,15 @@ class TFConvBertEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -501,6 +597,7 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
@@ -509,6 +606,17 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
@keras_serializable
class TFConvBertMainLayer(tf.keras.layers.Layer):
@@ -616,6 +724,20 @@ class TFConvBertMainLayer(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "embeddings_project", None) is not None:
with tf.name_scope(self.embeddings_project.name):
self.embeddings_project.build([None, None, self.config.embedding_size])
class TFConvBertPreTrainedModel(TFPreTrainedModel):
"""
@@ -770,6 +892,14 @@ class TFConvBertModel(TFConvBertPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
@@ -814,6 +944,7 @@ class TFConvBertGeneratorPredictions(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense")
self.config = config
def call(self, generator_hidden_states, training=False):
hidden_states = self.dense(generator_hidden_states)
@@ -822,6 +953,17 @@ class TFConvBertGeneratorPredictions(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING) @add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingLoss): class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingLoss):
...@@ -901,6 +1043,20 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL ...@@ -901,6 +1043,20 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL
attentions=generator_hidden_states.attentions, attentions=generator_hidden_states.attentions,
) )
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "generator_predictions", None) is not None:
with tf.name_scope(self.generator_predictions.name):
self.generator_predictions.build(None)
if getattr(self, "generator_lm_head", None) is not None:
with tf.name_scope(self.generator_lm_head.name):
self.generator_lm_head.build(None)
class TFConvBertClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
@@ -931,6 +1087,17 @@ class TFConvBertClassificationHead(tf.keras.layers.Layer):
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -999,6 +1166,17 @@ class TFConvBertForSequenceClassification(TFConvBertPreTrainedModel, TFSequenceC
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
@@ -1018,6 +1196,7 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLos
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
@@ -1092,6 +1271,20 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLos
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1113,6 +1306,7 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassif
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1167,6 +1361,17 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassif
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1184,6 +1389,7 @@ class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnswer
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1252,3 +1458,14 @@ class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnswer
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@@ -81,6 +81,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
)
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels
self.config = config
def call(self, pixel_values):
if isinstance(pixel_values, dict):
@@ -101,6 +102,17 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
embeddings = self.layernorm(embeddings)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build([None, None, None, self.config.num_channels])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextLayer(tf.keras.layers.Layer):
"""This corresponds to the `Block` class in the original implementation.
@@ -167,7 +179,25 @@ class TFConvNextLayer(tf.keras.layers.Layer):
if self.config.layer_scale_init_value > 0
else None
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dwconv", None) is not None:
with tf.name_scope(self.dwconv.name):
self.dwconv.build([None, None, None, self.dim])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.dim])
if getattr(self, "pwconv1", None) is not None:
with tf.name_scope(self.pwconv1.name):
self.pwconv1.build([None, None, self.dim])
if getattr(self, "pwconv2", None) is not None:
with tf.name_scope(self.pwconv2.name):
self.pwconv2.build([None, None, 4 * self.dim])
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
def call(self, hidden_states, training=False):
input = hidden_states
@@ -245,6 +275,9 @@ class TFConvNextStage(tf.keras.layers.Layer):
)
for j in range(depth)
]
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
def call(self, hidden_states):
for layer in self.downsampling_layer:
@@ -253,6 +286,20 @@ class TFConvNextStage(tf.keras.layers.Layer):
hidden_states = layer(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
if self.in_channels != self.out_channels or self.stride > 1:
with tf.name_scope(self.downsampling_layer[0].name):
self.downsampling_layer[0].build([None, None, None, self.in_channels])
with tf.name_scope(self.downsampling_layer[1].name):
self.downsampling_layer[1].build([None, None, None, self.in_channels])
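The stage stores in_channels, out_channels and stride precisely because its downsampling sub-layers only exist (and only own weights) when the channel count or stride changes; build() has to repeat that same condition before touching self.downsampling_layer. A minimal sketch of that guard, with made-up layer names and sizes rather than values from the diff:
import tensorflow as tf
def build_downsampler(downsampling_layer, in_channels, out_channels, stride):
    # Only the "real" downsampler (norm + strided conv) owns weights and needs
    # an explicit build; an identity placeholder would have nothing to build.
    if in_channels != out_channels or stride > 1:
        norm, conv = downsampling_layer
        with tf.name_scope(norm.name):
            norm.build([None, None, None, in_channels])
        with tf.name_scope(conv.name):
            conv.build([None, None, None, in_channels])
# Example usage with assumed sizes and names:
downsampler = [
    tf.keras.layers.LayerNormalization(epsilon=1e-6, name="downsampling_norm"),
    tf.keras.layers.Conv2D(filters=192, kernel_size=2, strides=2, name="downsampling_conv"),
]
build_downsampler(downsampler, in_channels=96, out_channels=192, stride=2)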
class TFConvNextEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -293,6 +340,11 @@ class TFConvNextEncoder(tf.keras.layers.Layer):
return TFBaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
def build(self, input_shape=None):
for stage in self.stages:
with tf.name_scope(stage.name):
stage.build(None)
@keras_serializable
class TFConvNextMainLayer(tf.keras.layers.Layer):
@@ -353,6 +405,20 @@ class TFConvNextMainLayer(tf.keras.layers.Layer):
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, self.config.hidden_sizes[-1]])
class TFConvNextPreTrainedModel(TFPreTrainedModel):
"""
@@ -485,6 +551,14 @@ class TFConvNextModel(TFConvNextPreTrainedModel):
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnext", None) is not None:
with tf.name_scope(self.convnext.name):
self.convnext.build(None)
@add_start_docstrings(
"""
@@ -507,6 +581,7 @@ class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClas
bias_initializer="zeros",
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
@@ -577,3 +652,15 @@ class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClas
logits=logits,
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnext", None) is not None:
with tf.name_scope(self.convnext.name):
self.convnext.build(None)
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
@@ -133,6 +133,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer):
)
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels
self.config = config
def call(self, pixel_values):
if isinstance(pixel_values, dict):
@@ -153,6 +154,17 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer):
embeddings = self.layernorm(embeddings)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build([None, None, None, self.config.num_channels])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextV2Layer(tf.keras.layers.Layer):
"""This corresponds to the `Block` class in the original implementation.
@@ -223,6 +235,29 @@ class TFConvNextV2Layer(tf.keras.layers.Layer):
x = input + x
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dwconv", None) is not None:
with tf.name_scope(self.dwconv.name):
self.dwconv.build([None, None, None, self.dim])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.dim])
if getattr(self, "pwconv1", None) is not None:
with tf.name_scope(self.pwconv1.name):
self.pwconv1.build([None, None, self.dim])
if getattr(self, "grn", None) is not None:
with tf.name_scope(self.grn.name):
self.grn.build(None)
if getattr(self, "pwconv2", None) is not None:
with tf.name_scope(self.pwconv2.name):
self.pwconv2.build([None, None, 4 * self.dim])
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2
class TFConvNextV2Stage(tf.keras.layers.Layer):
@@ -286,6 +321,9 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
)
for j in range(depth)
]
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
def call(self, hidden_states):
for layer in self.downsampling_layer:
@@ -294,6 +332,20 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
hidden_states = layer(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
if self.in_channels != self.out_channels or self.stride > 1:
with tf.name_scope(self.downsampling_layer[0].name):
self.downsampling_layer[0].build([None, None, None, self.in_channels])
with tf.name_scope(self.downsampling_layer[1].name):
self.downsampling_layer[1].build([None, None, None, self.in_channels])
class TFConvNextV2Encoder(tf.keras.layers.Layer):
def __init__(self, config: ConvNextV2Config, **kwargs):
@@ -339,6 +391,11 @@ class TFConvNextV2Encoder(tf.keras.layers.Layer):
return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
def build(self, input_shape=None):
for stage in self.stages:
with tf.name_scope(stage.name):
stage.build(None)
@keras_serializable
class TFConvNextV2MainLayer(tf.keras.layers.Layer):
@@ -401,6 +458,20 @@ class TFConvNextV2MainLayer(tf.keras.layers.Layer):
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, self.config.hidden_sizes[-1]])
class TFConvNextV2PreTrainedModel(TFPreTrainedModel):
"""
@@ -519,6 +590,14 @@ class TFConvNextV2Model(TFConvNextV2PreTrainedModel):
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnextv2", None) is not None:
with tf.name_scope(self.convnextv2.name):
self.convnextv2.build(None)
@add_start_docstrings(
"""
@@ -593,3 +672,14 @@ class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequence
logits=logits,
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnextv2", None) is not None:
with tf.name_scope(self.convnextv2.name):
self.convnextv2.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
@@ -142,6 +142,23 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "Wq", None) is not None:
with tf.name_scope(self.Wq.name):
self.Wq.build([None, None, self.d_model_size])
if getattr(self, "Wk", None) is not None:
with tf.name_scope(self.Wk.name):
self.Wk.build([None, None, self.d_model_size])
if getattr(self, "Wv", None) is not None:
with tf.name_scope(self.Wv.name):
self.Wv.build([None, None, self.d_model_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.d_model_size])
class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
def __init__(self, d_model_size, dff, **kwargs):
@@ -149,6 +166,8 @@ class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
self.dense_0 = tf.keras.layers.Dense(dff, activation="relu", name="0")
self.dense_2 = tf.keras.layers.Dense(d_model_size, name="2")
self.d_model_size = d_model_size
self.dff = dff
def call(self, inputs, trainable=False):
dense_0_output = self.dense_0(inputs)
@@ -156,6 +175,17 @@ class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
return dense_2_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense_0", None) is not None:
with tf.name_scope(self.dense_0.name):
self.dense_0.build([None, None, self.d_model_size])
if getattr(self, "dense_2", None) is not None:
with tf.name_scope(self.dense_2.name):
self.dense_2.build([None, None, self.dff])
class TFEncoderLayer(tf.keras.layers.Layer):
def __init__(
@@ -175,6 +205,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
self.d_model_size = d_model_size
def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
normed = self.layernorm1(x)
@@ -202,6 +233,23 @@ class TFEncoderLayer(tf.keras.layers.Layer):
outputs = (out2,) + attn_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "multi_head_attention", None) is not None:
with tf.name_scope(self.multi_head_attention.name):
self.multi_head_attention.build(None)
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build(None)
if getattr(self, "layernorm1", None) is not None:
with tf.name_scope(self.layernorm1.name):
self.layernorm1.build([None, None, self.d_model_size])
if getattr(self, "layernorm2", None) is not None:
with tf.name_scope(self.layernorm2.name):
self.layernorm2.build([None, None, self.d_model_size])
@keras_serializable
class TFCTRLMainLayer(tf.keras.layers.Layer):
@@ -396,6 +444,21 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
attentions=all_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "w", None) is not None:
with tf.name_scope(self.w.name):
self.w.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.n_embd])
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
class TFCTRLPreTrainedModel(TFPreTrainedModel):
"""
@@ -563,6 +626,14 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
class TFCTRLBiasLayer(tf.keras.layers.Layer):
"""
@@ -710,6 +781,17 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
@add_start_docstrings(
"""
@@ -737,6 +819,7 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
use_bias=False,
)
self.transformer = TFCTRLMainLayer(config, name="transformer")
self.config = config
def get_output_embeddings(self):
# Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too.
@@ -836,3 +919,14 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.n_embd])
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@@ -107,6 +107,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
self,
config: CvtConfig,
patch_size: int,
num_channels: int,
embed_dim: int,
stride: int,
padding: int,
@@ -117,6 +118,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
self.convolution_embeddings = TFCvtConvEmbeddings(
config,
patch_size=patch_size,
num_channels=num_channels,
embed_dim=embed_dim,
stride=stride,
padding=padding,
@@ -129,11 +131,28 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
hidden_state = self.dropout(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_embeddings", None) is not None:
with tf.name_scope(self.convolution_embeddings.name):
self.convolution_embeddings.build(None)
class TFCvtConvEmbeddings(tf.keras.layers.Layer):
"""Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
def __init__(self, config: CvtConfig, patch_size: int, embed_dim: int, stride: int, padding: int, **kwargs):
def __init__(
self,
config: CvtConfig,
patch_size: int,
num_channels: int,
embed_dim: int,
stride: int,
padding: int,
**kwargs,
):
super().__init__(**kwargs)
self.padding = tf.keras.layers.ZeroPadding2D(padding=padding)
self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
@@ -148,6 +167,8 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
)
# Using the same default epsilon as PyTorch
self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
self.num_channels = num_channels
self.embed_dim = embed_dim
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
if isinstance(pixel_values, dict):
@@ -165,6 +186,17 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
return pixel_values
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, self.embed_dim])
class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
"""Convolutional projection layer."""
@@ -184,12 +216,24 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
)
# Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum)
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution(self.padding(hidden_state))
hidden_state = self.normalization(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.embed_dim])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.embed_dim])
class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer):
"""Linear projection layer used to flatten tokens into 1D."""
@@ -227,6 +271,14 @@ class TFCvtSelfAttentionProjection(tf.keras.layers.Layer):
hidden_state = self.linear_projection(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_projection", None) is not None:
with tf.name_scope(self.convolution_projection.name):
self.convolution_projection.build(None)
class TFCvtSelfAttention(tf.keras.layers.Layer):
"""
@@ -348,6 +400,29 @@ class TFCvtSelfAttention(tf.keras.layers.Layer):
context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
return context
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_projection_query", None) is not None:
with tf.name_scope(self.convolution_projection_query.name):
self.convolution_projection_query.build(None)
if getattr(self, "convolution_projection_key", None) is not None:
with tf.name_scope(self.convolution_projection_key.name):
self.convolution_projection_key.build(None)
if getattr(self, "convolution_projection_value", None) is not None:
with tf.name_scope(self.convolution_projection_value.name):
self.convolution_projection_value.build(None)
if getattr(self, "projection_query", None) is not None:
with tf.name_scope(self.projection_query.name):
self.projection_query.build([None, None, self.embed_dim])
if getattr(self, "projection_key", None) is not None:
with tf.name_scope(self.projection_key.name):
self.projection_key.build([None, None, self.embed_dim])
if getattr(self, "projection_value", None) is not None:
with tf.name_scope(self.projection_value.name):
self.projection_value.build([None, None, self.embed_dim])
class TFCvtSelfOutput(tf.keras.layers.Layer):
"""Output of the Attention layer ."""
@@ -358,12 +433,21 @@ class TFCvtSelfOutput(tf.keras.layers.Layer):
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.dense(inputs=hidden_state)
hidden_state = self.dropout(inputs=hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.embed_dim])
class TFCvtAttention(tf.keras.layers.Layer):
"""Attention layer. First chunk of the convolutional transformer block."""
@@ -411,6 +495,17 @@ class TFCvtAttention(tf.keras.layers.Layer):
attention_output = self.dense_output(self_output, training=training)
return attention_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFCvtIntermediate(tf.keras.layers.Layer):
"""Intermediate dense layer. Second chunk of the convolutional transformer block."""
@@ -423,23 +518,34 @@ class TFCvtIntermediate(tf.keras.layers.Layer):
activation="gelu",
name="dense",
)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
hidden_state = self.dense(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.embed_dim])
class TFCvtOutput(tf.keras.layers.Layer):
"""
Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
"""
def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: int, **kwargs):
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim
self.mlp_ratio = mlp_ratio
def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.dense(inputs=hidden_state)
@@ -447,6 +553,14 @@ class TFCvtOutput(tf.keras.layers.Layer):
hidden_state = hidden_state + input_tensor
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])
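TFCvtOutput now also receives mlp_ratio because its dense layer projects back down from the intermediate layer's expanded width, so build() needs embed_dim * mlp_ratio as the last input dimension. A one-line illustration with assumed numbers (not taken from any config in the diff):
# Assumed values for illustration only.
embed_dim, mlp_ratio = 192, 4.0
output_input_dim = int(embed_dim * mlp_ratio)  # last input dim seen by the output projection
print(output_input_dim)  # 768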
class TFCvtLayer(tf.keras.layers.Layer):
"""
@@ -492,7 +606,7 @@ class TFCvtLayer(tf.keras.layers.Layer):
name="attention",
)
self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
self.dense_output = TFCvtOutput(config, embed_dim, drop_rate, name="output")
self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output")
# Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour.
self.drop_path = (
TFCvtDropPath(drop_path_rate, name="drop_path")
@@ -502,6 +616,7 @@ class TFCvtLayer(tf.keras.layers.Layer):
# Using the same default epsilon as PyTorch
self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
# in Cvt, layernorm is applied before self-attention
@@ -520,6 +635,29 @@ class TFCvtLayer(tf.keras.layers.Layer):
layer_output = self.drop_path(layer_output, training=training)
return layer_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.embed_dim])
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.embed_dim])
class TFCvtStage(tf.keras.layers.Layer):
"""
@@ -548,6 +686,7 @@ class TFCvtStage(tf.keras.layers.Layer):
self.embedding = TFCvtEmbeddings(
self.config,
patch_size=config.patch_sizes[self.stage],
num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
stride=config.patch_stride[self.stage],
embed_dim=config.embed_dim[self.stage],
padding=config.patch_padding[self.stage],
@@ -603,6 +742,18 @@ class TFCvtStage(tf.keras.layers.Layer):
hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
return hidden_state, cls_token
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding", None) is not None:
with tf.name_scope(self.embedding.name):
self.embedding.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
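The new num_channels argument threads each stage's input channel count down to its conv embedding: stage 0 sees the raw image channels, every later stage sees the previous stage's embed_dim. A tiny illustration with assumed config values (not taken from the diff):
# Assumed values for illustration only.
num_channels = 3            # stands in for config.num_channels
embed_dim = [64, 192, 384]  # stands in for config.embed_dim
stage_in_channels = [
    num_channels if stage == 0 else embed_dim[stage - 1]
    for stage in range(len(embed_dim))
]
print(stage_in_channels)  # [3, 64, 192]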
class TFCvtEncoder(tf.keras.layers.Layer):
"""
@@ -655,6 +806,15 @@ class TFCvtEncoder(tf.keras.layers.Layer):
hidden_states=all_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stages", None) is not None:
for layer in self.stages:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFCvtMainLayer(tf.keras.layers.Layer):
@@ -696,6 +856,14 @@ class TFCvtMainLayer(tf.keras.layers.Layer):
hidden_states=encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
class TFCvtPreTrainedModel(TFPreTrainedModel):
"""
@@ -815,6 +983,14 @@ class TFCvtModel(TFCvtPreTrainedModel):
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "cvt", None) is not None:
with tf.name_scope(self.cvt.name):
self.cvt.build(None)
@add_start_docstrings(
"""
@@ -840,6 +1016,7 @@ class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassification
bias_initializer="zeros",
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
@@ -909,3 +1086,18 @@ class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassification
return ((loss,) + output) if loss is not None else output
return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "cvt", None) is not None:
with tf.name_scope(self.cvt.name):
self.cvt.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.embed_dim[-1]])
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.embed_dim[-1]])
@@ -84,7 +84,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.dropout)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
@@ -99,7 +99,12 @@ class TFEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.dim])
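TFEmbeddings is the one case in this file where build() both creates its own weights with add_weight and then builds a child LayerNorm, and its signature loosens to input_shape=None so it can be called without a shape. A minimal sketch of that mixed pattern; the ToyEmbeddings layer and its sizes are assumptions, not code from the diff:
import tensorflow as tf
class ToyEmbeddings(tf.keras.layers.Layer):
    """Illustrative only: add_weight plus an explicit child build in one build()."""
    def __init__(self, vocab_size=100, dim=16, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size, self.dim = vocab_size, dim
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
    def build(self, input_shape=None):
        # Own weights first, under a fixed scope so their names stay stable.
        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight", shape=[self.vocab_size, self.dim], initializer="zeros"
            )
        if self.built:
            return
        self.built = True
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.dim])
    def call(self, input_ids):
        return self.LayerNorm(tf.gather(self.weight, input_ids))
emb = ToyEmbeddings(name="embeddings")
emb.build(None)  # no shape needed any more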
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
"""
@@ -152,6 +157,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
)
self.pruned_heads = set()
self.config = config
def prune_heads(self, heads):
raise NotImplementedError
@@ -212,6 +218,23 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
else:
return (context,)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.config.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.config.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.config.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.config.dim])
class TFFFN(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -224,6 +247,7 @@ class TFFFN(tf.keras.layers.Layer):
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
)
self.activation = get_tf_activation(config.activation)
self.config = config
def call(self, input, training=False):
x = self.lin1(input)
@@ -232,6 +256,17 @@ class TFFFN(tf.keras.layers.Layer):
x = self.dropout(x, training=training)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.config.dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.config.hidden_dim])
class TFTransformerBlock(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -253,6 +288,7 @@ class TFTransformerBlock(tf.keras.layers.Layer):
self.ffn = TFFFN(config, name="ffn")
self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
self.config = config
def call(self, x, attn_mask, head_mask, output_attentions, training=False): # removed: src_enc=None, src_len=None
"""
@@ -281,6 +317,23 @@ class TFTransformerBlock(tf.keras.layers.Layer):
output = (sa_weights,) + output
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "sa_layer_norm", None) is not None:
with tf.name_scope(self.sa_layer_norm.name):
self.sa_layer_norm.build([None, None, self.config.dim])
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build(None)
if getattr(self, "output_layer_norm", None) is not None:
with tf.name_scope(self.output_layer_norm.name):
self.output_layer_norm.build([None, None, self.config.dim])
class TFTransformer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -336,6 +389,15 @@ class TFTransformer(tf.keras.layers.Layer):
last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFDistilBertMainLayer(tf.keras.layers.Layer):
@@ -412,6 +474,17 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class TFDistilBertPreTrainedModel(TFPreTrainedModel):
@@ -548,6 +621,14 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
class TFDistilBertLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
@@ -667,6 +748,23 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "vocab_transform", None) is not None:
with tf.name_scope(self.vocab_transform.name):
self.vocab_transform.build([None, None, self.config.dim])
if getattr(self, "vocab_layer_norm", None) is not None:
with tf.name_scope(self.vocab_layer_norm.name):
self.vocab_layer_norm.build([None, None, self.config.dim])
if getattr(self, "vocab_projector", None) is not None:
with tf.name_scope(self.vocab_projector.name):
self.vocab_projector.build(None)
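In the masked-LM head, vocab_transform and vocab_layer_norm receive [None, None, config.dim] because they act on the hidden states, while vocab_projector gets None: the TFDistilBertLMHead reuses the input embeddings for the output projection, so the only weight it owns itself is a bias over the vocabulary, which does not depend on the input shape. A stripped-down sketch of that kind of head (TinyLMHead and its arguments are illustrative, not the library's class):

import tensorflow as tf

class TinyLMHead(tf.keras.layers.Layer):
    """Illustrative LM head that shares an embedding matrix and only owns a vocab-sized bias."""

    def __init__(self, vocab_size, embedding_matrix, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_matrix = embedding_matrix   # shared weight of shape (vocab_size, dim), not a copy

    def build(self, input_shape=None):
        # The bias shape depends only on vocab_size, never on the incoming tensor,
        # which is why callers can simply pass build(None).
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", name="bias")
        super().build(input_shape)

    def call(self, hidden_states):
        logits = tf.matmul(hidden_states, self.embedding_matrix, transpose_b=True)
        return logits + self.bias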
@add_start_docstrings(
    """
...@@ -691,6 +789,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
self.config = config
    @unpack_inputs
    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
...@@ -746,6 +845,20 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
            attentions=distilbert_output.attentions,
        )
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "pre_classifier", None) is not None:
with tf.name_scope(self.pre_classifier.name):
self.pre_classifier.build([None, None, self.config.dim])
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.dim])
@add_start_docstrings(
    """
...@@ -764,6 +877,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
self.config = config
    @unpack_inputs
    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
...@@ -814,6 +928,17 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
            attentions=outputs.attentions,
        )
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """
...@@ -837,6 +962,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
        self.classifier = tf.keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
self.config = config
    @unpack_inputs
    @add_start_docstrings_to_model_forward(
...@@ -908,6 +1034,20 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
            attentions=distilbert_output.attentions,
        )
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "pre_classifier", None) is not None:
with tf.name_scope(self.pre_classifier.name):
self.pre_classifier.build([None, None, self.config.dim])
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.dim])
@add_start_docstrings(
    """
...@@ -926,6 +1066,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
        )
        assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2"
        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
self.config = config
    @unpack_inputs
    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
...@@ -991,3 +1132,14 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.dim])