Unverified commit 050e0b44, authored by Matt, committed by GitHub

Proper build() methods for TF (#27794)

* Add a convenience method for building in your own name scope

* Second attempt at auto layer building

* Revert "Second attempt at auto layer building"

This reverts commit e03a3aaecf9ec41a805582b83cbdfe3290a631be.

* Attempt #3

* Revert "Attempt #3"

This reverts commit b9df7a0857560d29b5abbed6127d9e9eca77cf47.

* Add missing attributes that we're going to need later

* Add some attributes we're going to need later

* A fourth attempt! Feel the power flow through you!

* Revert "A fourth attempt! Feel the power flow through you!"

This reverts commit 6bf4aaf3875d6f28485f50187617a4c616c8aff7.

* Add more values we'll need later

* TF refactor that we'll need later

* Revert "TF refactor that we'll need later"

This reverts commit ca07202fb5b7b7436b893baa8d688b4f348ea7b9.

* Revert "Revert "TF refactor that we'll need later""

This reverts commit 1beb0f39f293ed9c27594575e1c849aadeb15c13.

* make fixup

* Attempt five!

* Revert "Attempt five!"

This reverts commit 3302207958dfd0374b0447a51c06eea51a506044.

* Attempt six - this time don't add empty methods

* Revert "Attempt six - this time don't add empty methods"

This reverts commit 67d60129be75416b6beb8f47c7d38d77b18d79bb.

* Attempt seven - better base model class detection!

* Revert "Attempt seven - better base model class detection!"

This reverts commit 5f14845e92ea0e87c598da933bfbfee10f553bc9.

* Another attribute we'll need later

* Try again with the missing attribute!

* Revert "Try again with the missing attribute!"

This reverts commit 760c6f30c5dffb3e04b0e73c34a77d1882a0fef7.

* This is the attempt that will pierce the heavens!

* Revert "This is the attempt that will pierce the heavens!"

This reverts commit c868bb657de057aca7a5260350a3f831fc4dfee6.

* Attempt seven - snag list is steadily decreasing

* Revert "Attempt seven - snag list is steadily decreasing"

This reverts commit 46fbd975deda64429bfb3e5fac4fc0370c00d316.

* Attempt eight - will an empty snag list do it?

* Revert "Attempt eight - will an empty snag list do it?"

This reverts commit 7c8a3c2b083253649569e9877e02054ae5cec67b.

* Fixes to Hubert issues that cause problems later

* Trying again with Conv1D/SeparableConv fixes

* Revert "Trying again with Conv1D/SeparableConv fixes"

This reverts commit 55092bca952bc0f750aa1ffe246a640bf1e2036e.

* Apply the build shape fixes to Wav2Vec2 as well

* One more attempt!

* Revert "One more attempt!"

This reverts commit 5ac3e4cb01b9458cc93312873725f9444ae7261c.

* Another attempt!

* Revert "Another attempt!"

This reverts commit ea16d890e019d7de8792a3b8e72f3b1c02adae50.

* Let's see how many failures we get without the internal build method

* Fix OpenAI

* Fix MobileBERT

* (Mostly) fix GroupVIT

* Fix BLIP

* One more BLIP fix

* One more BLIP fix!

* Fix Regnet

* Finally fully fix GroupViT

* Fix Data2Vec and add the new AdaptivePool

* Fix Segformer

* Fix Albert

* Fix Deberta/DebertaV2

* Fix XLM

* Actually fix XLM

* Fix Flaubert

* Fix lxmert

* Fix Resnet

* Fix ConvBERT

* Fix ESM

* Fix Convnext / ConvnextV2

* Fix SAM

* Fix Efficientformer

* Fix LayoutLMv3

* Fix speech_to_text

* Fix mpnet and mobilevit

* Fix Swin

* Fix CTRL

* Fix CVT

* Fix DPR

* Fix Wav2Vec2

* Fix T5

* Fix Hubert

* Fix GPT2

* Fix Whisper

* Fix DeiT

* Fix the encoder-decoder / dual-encoder classes

* make fix-copies

* build in name scope

* Fix summarization test

* Fix tied weight names for BART + Blenderbot

* Fix tied weight name building

* Fix to TFESM weight building

* Update TF SAM

* Expand all the shapes out into Big Boy Shapes
parent 52c37882
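
The diff below applies one repeated pattern across the base utilities and every model file touched: each TF layer gains an explicit build() that bails out if the layer is already built, builds every sublayer inside tf.name_scope(sublayer.name), and passes a partial shape such as [None, None, hidden_size] (the "Big Boy Shapes" of the final commit message). The __init__ methods now stash config or the relevant dimensions so those shapes are known at build time, and lists of sublayers are built the same way in a loop. A minimal standalone sketch of that pattern (illustration only, not code from the diff):

import tensorflow as tf

class ToyBlock(tf.keras.layers.Layer):
    # Minimal sketch of the build() pattern the hunks below apply to every layer.
    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.hidden_size = hidden_size  # kept around so build() can work out shapes later

    def build(self, input_shape=None):
        if self.built:  # never build twice
            return
        self.built = True
        # Dense/LayerNorm only need the last input dimension to create their weights,
        # so a partial shape like [None, None, hidden_size] is enough. Wrapping each
        # sub-build in the sublayer's own name scope reproduces the variable names a
        # real forward pass would have produced.
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.hidden_size])

block = ToyBlock(hidden_size=16, name="toy_block")
block.build(None)
print([w.name for w in block.weights])  # all weights exist, no dummy forward pass needed
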
......@@ -35,7 +35,6 @@ import tensorflow as tf
from huggingface_hub import Repository, list_repo_files
from keras import backend as K
from packaging.version import parse
from tensorflow.python.util.keras_deps import get_call_context_function
from . import DataCollatorWithPadding, DefaultDataCollator
from .activations_tf import get_tf_activation
......@@ -1122,6 +1121,10 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
)
return dummies
def build_in_name_scope(self):
with tf.name_scope(self.name):
self.build(input_shape=None)
@property
def framework(self) -> str:
"""
......@@ -1130,15 +1133,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
return "tf"
def build(self, input_shape=None):
call_context = get_call_context_function()
if self.built or call_context().in_call:
self.built = True
else:
self.built = True
# Set the serving spec quickly to ensure that Keras doesn't use the specific dummy input shapes as the spec
# Setting it in build() allows users to override the shape when loading a non-pretrained model from config
self._set_save_spec(self.input_signature)
self(self.dummy_inputs, training=False)
pass # This is just here to make sure we don't call the superclass build()
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
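
The two hunks above are the core of the change: TFPreTrainedModel.build() becomes a deliberate no-op, so Keras no longer builds models by tracing a dummy forward pass (and the private keras_deps call-context import removed at the top of the file is no longer needed), while the new build_in_name_scope() wraps the subclass build() in the model's own name scope so weights are created under the nested names checkpoints expect. A hypothetical usage sketch (assuming TFBertModel as the example; not code from the diff):

from transformers import BertConfig, TFBertModel

model = TFBertModel(BertConfig(), name="tf_bert_model")
model.build_in_name_scope()  # creates every weight without a dummy forward pass
print(model.weights[0].name)
# Expected to start with "tf_bert_model/bert/...". A bare model.build() would create
# the same weights but without the leading "tf_bert_model/" prefix, which is why the
# helpers in the following hunks switch from self.build() to self.build_in_name_scope().
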
......@@ -1869,7 +1864,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
main_layer.set_input_embeddings(value)
except AttributeError:
logger.info("Building the model")
self.build()
self.build_in_name_scope()
main_layer.set_input_embeddings(value)
def get_output_embeddings(self) -> Union[None, tf.keras.layers.Layer]:
......@@ -1886,7 +1881,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
return lm_head.get_output_embeddings()
except AttributeError:
logger.info("Building the model")
self.build()
self.build_in_name_scope()
return lm_head().get_output_embeddings()
......@@ -1906,7 +1901,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
lm_head.set_output_embeddings(value)
except AttributeError:
logger.info("Building the model")
self.build()
self.build_in_name_scope()
lm_head.set_output_embeddings(value)
def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
......@@ -1944,7 +1939,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
try:
return lm_head.get_bias()
except AttributeError:
self.build()
self.build_in_name_scope()
return lm_head.get_bias()
return None
......@@ -1962,7 +1957,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
try:
lm_head.set_bias(value)
except AttributeError:
self.build()
self.build_in_name_scope()
lm_head.set_bias(value)
def get_lm_head(self) -> tf.keras.layers.Layer:
......@@ -2049,7 +2044,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
# The reason why the attributes don't exist might be
# because the model is not built, so retry getting
# the argument after building the model
model.build()
model.build_in_name_scope()
embeds = getattr(embedding_layer, "weight", None)
if embeds is not None:
......@@ -2914,9 +2909,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
# we might need to extend the variable scope for composite models
if load_weight_prefix is not None:
with tf.compat.v1.variable_scope(load_weight_prefix):
model.build() # build the network with dummy inputs
model.build_in_name_scope() # build the network with dummy inputs
else:
model.build() # build the network with dummy inputs
model.build_in_name_scope() # build the network with dummy inputs
if safetensors_from_pt:
from .modeling_tf_pytorch_utils import load_pytorch_state_dict_in_tf2_model
......@@ -3215,6 +3210,9 @@ class TFConv1D(tf.keras.layers.Layer):
self.initializer_range = initializer_range
def build(self, input_shape):
if self.built:
return
self.built = True
self.weight = self.add_weight(
"weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
)
......@@ -3398,6 +3396,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
if self.has_last_dropout:
self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
self.hidden_size = config.hidden_size
def call(self, inputs, cls_index=None, training=False):
if not isinstance(inputs, (dict, tuple, list)):
......@@ -3450,6 +3449,14 @@ class TFSequenceSummary(tf.keras.layers.Layer):
return output
def build(self, input_shape):
if self.built:
return
self.built = True
if getattr(self, "summary", None) is not None:
with tf.name_scope("summary"):
self.summary.build(self.hidden_size)
def get_initializer(initializer_range: float = 0.02) -> tf.keras.initializers.TruncatedNormal:
"""
......
......@@ -146,7 +146,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -168,7 +168,12 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
def call(
......@@ -246,6 +251,7 @@ class TFAlbertAttention(tf.keras.layers.Layer):
# Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -307,6 +313,26 @@ class TFAlbertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFAlbertLayer(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
......@@ -329,6 +355,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(
self,
......@@ -356,6 +383,23 @@ class TFAlbertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build([None, None, self.config.hidden_size])
if getattr(self, "ffn_output", None) is not None:
with tf.name_scope(self.ffn_output.name):
self.ffn_output.build([None, None, self.config.intermediate_size])
if getattr(self, "full_layer_layer_norm", None) is not None:
with tf.name_scope(self.full_layer_layer_norm.name):
self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
class TFAlbertLayerGroup(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
......@@ -399,6 +443,15 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert_layers", None) is not None:
for layer in self.albert_layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFAlbertTransformer(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
......@@ -416,6 +469,7 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
self.albert_layer_groups = [
TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
]
self.config = config
def call(
self,
......@@ -457,6 +511,18 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding_hidden_mapping_in", None) is not None:
with tf.name_scope(self.embedding_hidden_mapping_in.name):
self.embedding_hidden_mapping_in.build([None, None, self.config.embedding_size])
if getattr(self, "albert_layer_groups", None) is not None:
for layer in self.albert_layer_groups:
with tf.name_scope(layer.name):
layer.build(None)
class TFAlbertPreTrainedModel(TFPreTrainedModel):
"""
......@@ -488,13 +554,21 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
self.decoder_bias = self.add_weight(
shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.decoder
......@@ -650,6 +724,20 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build([None, None, self.config.hidden_size])
@dataclass
class TFAlbertForPreTrainingOutput(ModelOutput):
......@@ -825,6 +913,14 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
@add_start_docstrings(
"""
......@@ -921,6 +1017,20 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
if getattr(self, "sop_classifier", None) is not None:
with tf.name_scope(self.sop_classifier.name):
self.sop_classifier.build(None)
class TFAlbertSOPHead(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
......@@ -932,6 +1042,7 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
......@@ -939,6 +1050,14 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
return logits
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
......@@ -1035,6 +1154,17 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@add_start_docstrings(
"""
......@@ -1058,6 +1188,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1117,6 +1248,17 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1145,6 +1287,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1200,6 +1343,17 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1221,6 +1375,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
self.qa_outputs = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1295,6 +1450,17 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1316,6 +1482,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
self.classifier = tf.keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -1394,3 +1561,14 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "albert", None) is not None:
with tf.name_scope(self.albert.name):
self.albert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
......@@ -43,7 +43,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
......@@ -296,6 +295,23 @@ class TFBartAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFBartEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: BartConfig, **kwargs):
......@@ -311,6 +327,7 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -352,6 +369,26 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBartDecoderLayer(tf.keras.layers.Layer):
def __init__(self, config: BartConfig, **kwargs):
......@@ -380,6 +417,7 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -461,6 +499,32 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBartClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
......@@ -470,6 +534,8 @@ class TFBartClassificationHead(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(inner_dim, name="dense")
self.dropout = tf.keras.layers.Dropout(pooler_dropout)
self.out_proj = tf.keras.layers.Dense(num_classes, name="out_proj")
self.input_dim = inner_dim
self.inner_dim = inner_dim
def call(self, inputs):
hidden_states = self.dropout(inputs)
......@@ -479,6 +545,17 @@ class TFBartClassificationHead(tf.keras.layers.Layer):
hidden_states = self.out_proj(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.input_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.inner_dim])
class TFBartPretrainedModel(TFPreTrainedModel):
config_class = BartConfig
......@@ -686,6 +763,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
)
self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model
@unpack_inputs
def call(
......@@ -745,16 +823,8 @@ class TFBartEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
......@@ -809,6 +879,21 @@ class TFBartEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.embed_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBartDecoder(tf.keras.layers.Layer):
......@@ -938,16 +1023,8 @@ class TFBartDecoder(tf.keras.layers.Layer):
positions = self.embed_positions(input_shape, position_ids=position_ids)
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
hidden_states = inputs_embeds
......@@ -1032,6 +1109,21 @@ class TFBartDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBartMainLayer(tf.keras.layers.Layer):
......@@ -1149,6 +1241,22 @@ class TFBartMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare BART Model outputting raw hidden-states without any specific head on top.",
......@@ -1237,6 +1345,14 @@ class TFBartModel(TFBartPretrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
class BiasLayer(tf.keras.layers.Layer):
"""
......@@ -1440,6 +1556,17 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageMode
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
@add_start_docstrings(
"""
......@@ -1567,3 +1694,14 @@ class TFBartForSequenceClassification(TFBartPretrainedModel, TFSequenceClassific
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "classification_head", None) is not None:
with tf.name_scope(self.classification_head.name):
self.classification_head.build(None)
......@@ -156,7 +156,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -178,7 +178,12 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def call(
self,
......@@ -248,6 +253,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -337,6 +343,20 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFBertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -347,6 +367,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -355,6 +376,17 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertAttention(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -395,6 +427,17 @@ class TFBertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFBertIntermediate(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -408,6 +451,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -415,6 +459,14 @@ class TFBertIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFBertOutput(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -425,6 +477,7 @@ class TFBertOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -433,6 +486,17 @@ class TFBertOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLayer(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -519,6 +583,23 @@ class TFBertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
class TFBertEncoder(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -588,6 +669,15 @@ class TFBertEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFBertPooler(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -599,6 +689,7 @@ class TFBertPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -608,6 +699,14 @@ class TFBertPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -625,6 +724,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -633,6 +733,17 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
......@@ -647,10 +758,15 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.input_embeddings
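
TFBertLMPredictionHead shows the one wrinkle in the pattern: its decoder matrix is tied to the input embeddings, so its build() only adds the output bias and builds the transform sublayer; the shared matrix itself is created when the embeddings layer builds. A simplified standalone sketch of such a tied head (assumption; the transform sublayer and the real call() body are omitted here):

import tensorflow as tf

class ToyTiedLMHead(tf.keras.layers.Layer):
    # Simplified sketch of a weight-tied LM head: it owns no decoder kernel of its own.
    def __init__(self, vocab_size, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.input_embeddings = input_embeddings  # the embedding layer owns the matrix

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Only the output bias belongs to the head itself.
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", name="bias")

    def call(self, hidden_states):
        # Project back to vocabulary space with the (tied) embedding matrix.
        logits = tf.matmul(hidden_states, self.input_embeddings.embeddings, transpose_b=True)
        return logits + self.bias

emb = tf.keras.layers.Embedding(100, 16, name="embeddings")
emb.build(None)
head = ToyTiedLMHead(vocab_size=100, input_embeddings=emb, name="lm_head")
head.build(None)
print(head(tf.zeros([2, 5, 16])).shape)  # (2, 5, 100)
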
......@@ -688,6 +804,14 @@ class TFBertMLMHead(tf.keras.layers.Layer):
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
class TFBertNSPHead(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
......@@ -698,12 +822,21 @@ class TFBertNSPHead(tf.keras.layers.Layer):
kernel_initializer=get_initializer(config.initializer_range),
name="seq_relationship",
)
self.config = config
def call(self, pooled_output: tf.Tensor) -> tf.Tensor:
seq_relationship_score = self.seq_relationship(inputs=pooled_output)
return seq_relationship_score
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "seq_relationship", None) is not None:
with tf.name_scope(self.seq_relationship.name):
self.seq_relationship.build([None, None, self.config.hidden_size])
@keras_serializable
class TFBertMainLayer(tf.keras.layers.Layer):
......@@ -891,6 +1024,20 @@ class TFBertMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFBertPreTrainedModel(TFPreTrainedModel):
"""
......@@ -1103,6 +1250,14 @@ class TFBertModel(TFBertPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
@add_start_docstrings(
"""
......@@ -1215,6 +1370,20 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "nsp", None) is not None:
with tf.name_scope(self.nsp.name):
self.nsp.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING)
class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
......@@ -1301,6 +1470,17 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
# names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
......@@ -1426,6 +1606,17 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top.""",
......@@ -1508,6 +1699,17 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "nsp", None) is not None:
with tf.name_scope(self.nsp.name):
self.nsp.build(None)
@add_start_docstrings(
"""
......@@ -1536,6 +1738,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1594,6 +1797,17 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1615,6 +1829,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
self.classifier = tf.keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -1693,6 +1908,17 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1727,6 +1953,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1783,6 +2010,17 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1812,6 +2050,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
kernel_initializer=get_initializer(config.initializer_range),
name="qa_outputs",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1884,3 +2123,14 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -41,7 +41,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
......@@ -291,6 +290,23 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot
class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
......@@ -307,6 +323,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -348,6 +365,26 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot
class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
......@@ -377,6 +414,7 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -458,6 +496,32 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBlenderbotPreTrainedModel(TFPreTrainedModel):
config_class = BlenderbotConfig
......@@ -711,16 +775,8 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
......@@ -776,6 +832,21 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBlenderbotDecoder(tf.keras.layers.Layer):
......@@ -916,12 +987,8 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
positions = self.embed_positions(input_shape, position_ids=position_ids)
if inputs_embeds is None:
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
hidden_states = inputs_embeds
......@@ -1006,6 +1073,21 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBlenderbotMainLayer(tf.keras.layers.Layer):
......@@ -1114,6 +1196,22 @@ class TFBlenderbotMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare BLENDERBOT Model outputting raw hidden-states without any specific head on top.",
......@@ -1217,6 +1315,14 @@ class TFBlenderbotModel(TFBlenderbotPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer):
......@@ -1436,3 +1542,14 @@ class TFBlenderbotForConditionalGeneration(TFBlenderbotPreTrainedModel, TFCausal
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache, # change this to avoid caching (presumably for debugging)
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
......@@ -40,7 +40,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
......@@ -291,6 +290,23 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
......@@ -307,6 +323,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -348,6 +365,26 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
......@@ -377,6 +414,7 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -458,6 +496,32 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBlenderbotSmallPreTrainedModel(TFPreTrainedModel):
config_class = BlenderbotSmallConfig
......@@ -646,6 +710,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
)
self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model
def get_embed_tokens(self):
return self.embed_tokens
......@@ -717,16 +782,8 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
......@@ -781,6 +838,21 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.embed_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
......@@ -917,16 +989,8 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
if input_shape[-1] > 1:
......@@ -1014,6 +1078,21 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layernorm_embedding", None) is not None:
with tf.name_scope(self.layernorm_embedding.name):
self.layernorm_embedding.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer):
......@@ -1122,6 +1201,22 @@ class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare BLENDERBOT_SMALL Model outputting raw hidden-states without any specific head on top.",
......@@ -1209,6 +1304,14 @@ class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer):
......@@ -1413,3 +1516,14 @@ class TFBlenderbotSmallForConditionalGeneration(TFBlenderbotSmallPreTrainedModel
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache, # change this to avoid caching (presumably for debugging)
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
......@@ -254,7 +254,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
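# (Editor's note, illustrative numbers only: with BLIP's typical 384x384 images and
# 16x16 patches, num_patches = (384 // 16) ** 2 = 576 and num_positions = 577; the
# extra position is the class embedding created in build() below.)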
def build(self, input_shape):
def build(self, input_shape=None):
self.class_embedding = self.add_weight(
shape=(1, 1, self.embed_dim),
initializer=get_initializer(self.config.initializer_range),
......@@ -268,7 +268,13 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
trainable=True,
name="position_embedding",
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embedding", None) is not None:
with tf.name_scope(self.patch_embedding.name):
self.patch_embedding.build([None, None, None, 3])
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
# Input is channels-first, we transpose. PyTorch transposes after the conv because PyTorch
......@@ -412,6 +418,20 @@ class TFBlipAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
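# Dropout has no weights, so building it below is a harmless no-op kept for uniformity.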
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "qkv", None) is not None:
with tf.name_scope(self.qkv.name):
self.qkv.build([None, None, self.embed_dim])
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, self.embed_dim])
class TFBlipMLP(tf.keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs):
......@@ -428,6 +448,7 @@ class TFBlipMLP(tf.keras.layers.Layer):
self.fc2 = tf.keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.fc1(inputs=hidden_states)
......@@ -435,6 +456,17 @@ class TFBlipMLP(tf.keras.layers.Layer):
hidden_states = self.fc2(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.config.hidden_size])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.intermediate_size])
class TFBlipEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs):
......@@ -485,6 +517,23 @@ class TFBlipEncoderLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, self.embed_dim])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, self.embed_dim])
class TFBlipPreTrainedModel(TFPreTrainedModel):
"""
......@@ -645,6 +694,15 @@ class TFBlipEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFBlipVisionModel(TFBlipPreTrainedModel):
main_input_name = "pixel_values"
......@@ -657,6 +715,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
self.encoder = TFBlipEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.embed_dim = config.hidden_size
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
......@@ -724,6 +783,20 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
def get_input_embeddings(self):
return self.embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "post_layernorm", None) is not None:
with tf.name_scope(self.post_layernorm.name):
self.post_layernorm.build([None, None, self.embed_dim])
class TFBlipMainLayer(tf.keras.layers.Layer):
config_class = BlipConfig
......@@ -775,7 +848,22 @@ class TFBlipMainLayer(tf.keras.layers.Layer):
initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True,
)
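# (Editor's note: logit_scale is presumably the learnable temperature applied to the
# image-text similarity logits, in the same role as in CLIP.)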
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection.name):
self.visual_projection.build([None, None, self.vision_embed_dim])
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection.name):
self.text_projection.build([None, None, self.text_embed_dim])
@unpack_inputs
def call(
......@@ -995,6 +1083,14 @@ class TFBlipModel(TFBlipPreTrainedModel):
return image_features
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "blip", None) is not None:
with tf.name_scope(self.blip.name):
self.blip.build(None)
@add_start_docstrings(
"""
......@@ -1168,6 +1264,17 @@ class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "text_decoder", None) is not None:
with tf.name_scope(self.text_decoder.name):
self.text_decoder.build(None)
@add_start_docstrings(
"""
......@@ -1409,6 +1516,20 @@ class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "text_encoder", None) is not None:
with tf.name_scope(self.text_encoder.name):
self.text_encoder.build(None)
if getattr(self, "text_decoder", None) is not None:
with tf.name_scope(self.text_decoder.name):
self.text_decoder.build(None)
@add_start_docstrings(
"""
......@@ -1457,6 +1578,7 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
if not hasattr(config, "decoder_start_token_id")
else config.decoder_start_token_id
)
self.config = config
def get_input_embeddings(self) -> tf.keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding
......@@ -1558,3 +1680,23 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
attentions=vision_outputs.attentions,
question_embeds=question_embeds,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "text_encoder", None) is not None:
with tf.name_scope(self.text_encoder.name):
self.text_encoder.build(None)
if getattr(self, "vision_proj", None) is not None:
with tf.name_scope(self.vision_proj.name):
self.vision_proj.build([None, None, self.config.vision_config.hidden_size])
if getattr(self, "text_proj", None) is not None:
with tf.name_scope(self.text_proj.name):
self.text_proj.build([None, None, self.config.text_config.hidden_size])
if getattr(self, "itm_head", None) is not None:
with tf.name_scope(self.itm_head.name):
self.itm_head.build([None, None, self.config.text_config.hidden_size])
......@@ -127,6 +127,23 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
embeddings = self.dropout(embeddings, training=training)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "word_embeddings", None) is not None:
with tf.name_scope(self.word_embeddings.name):
self.word_embeddings.build(None)
if getattr(self, "position_embeddings", None) is not None:
with tf.name_scope(self.position_embeddings.name):
self.position_embeddings.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
class TFBlipTextSelfAttention(tf.keras.layers.Layer):
......@@ -160,6 +177,7 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
self.distance_embedding = tf.keras.layers.Embedding(
2 * config.max_position_embeddings - 1, self.attention_head_size
)
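# 2 * max_position_embeddings - 1 rows: one embedding for every possible relative
# offset in [-(max_position_embeddings - 1), +(max_position_embeddings - 1)].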
self.is_cross_attention = is_cross_attention
def transpose_for_scores(self, x):
new_x_shape = tf.concat(
......@@ -250,6 +268,28 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
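# As cross-attention, key/value project from the encoder's hidden states, hence the
# encoder_hidden_size input dimension below; plain self-attention projects from the
# text model's own hidden_size.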
if self.is_cross_attention:
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.encoder_hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.encoder_hidden_size])
else:
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFBlipTextSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
......@@ -260,6 +300,7 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -268,6 +309,17 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
class TFBlipTextAttention(tf.keras.layers.Layer):
......@@ -302,6 +354,17 @@ class TFBlipTextAttention(tf.keras.layers.Layer):
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
class TFBlipTextIntermediate(tf.keras.layers.Layer):
......@@ -316,6 +379,7 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -323,6 +387,14 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFBlipTextOutput(tf.keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
......@@ -333,6 +405,7 @@ class TFBlipTextOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -341,6 +414,17 @@ class TFBlipTextOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -400,6 +484,23 @@ class TFBlipTextLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386
@keras_serializable
......@@ -481,6 +582,15 @@ class TFBlipTextEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
class TFBlipTextPooler(tf.keras.layers.Layer):
......@@ -493,6 +603,7 @@ class TFBlipTextPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -502,6 +613,14 @@ class TFBlipTextPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
......@@ -520,6 +639,7 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -528,6 +648,17 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -546,7 +677,16 @@ class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
def build(self, input_shape=None):
self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build([None, None, self.config.hidden_size])
def call(self, hidden_states):
hidden_states = self.transform(hidden_states)
......@@ -563,6 +703,14 @@ class TFBlipTextOnlyMLMHead(tf.keras.layers.Layer):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548
class TFBlipTextPreTrainedModel(TFPreTrainedModel):
......@@ -802,6 +950,20 @@ class TFBlipTextModel(TFBlipTextPreTrainedModel):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
......@@ -942,3 +1104,14 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
for layer_past in past_key_values:
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
return reordered_past
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "bert", None) is not None:
with tf.name_scope(self.bert.name):
self.bert.build(None)
if getattr(self, "cls", None) is not None:
with tf.name_scope(self.cls.name):
self.cls.build(None)
......@@ -184,7 +184,7 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -206,7 +206,12 @@ class TFCamembertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
......@@ -279,6 +284,7 @@ class TFCamembertPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -288,6 +294,14 @@ class TFCamembertPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert
class TFCamembertSelfAttention(tf.keras.layers.Layer):
......@@ -317,6 +331,7 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -406,6 +421,20 @@ class TFCamembertSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert
class TFCamembertSelfOutput(tf.keras.layers.Layer):
......@@ -417,6 +446,7 @@ class TFCamembertSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -425,6 +455,17 @@ class TFCamembertSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert
class TFCamembertAttention(tf.keras.layers.Layer):
......@@ -466,6 +507,17 @@ class TFCamembertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert
class TFCamembertIntermediate(tf.keras.layers.Layer):
......@@ -480,6 +532,7 @@ class TFCamembertIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -487,6 +540,14 @@ class TFCamembertIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert
class TFCamembertOutput(tf.keras.layers.Layer):
......@@ -498,6 +559,7 @@ class TFCamembertOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -506,6 +568,17 @@ class TFCamembertOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert
class TFCamembertLayer(tf.keras.layers.Layer):
......@@ -593,6 +666,23 @@ class TFCamembertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert
class TFCamembertEncoder(tf.keras.layers.Layer):
......@@ -663,6 +753,15 @@ class TFCamembertEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert
......@@ -861,6 +960,20 @@ class TFCamembertMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
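# The getattr(...) guards keep this safe when optional sublayers were never created
# (e.g. the pooler when the main layer is instantiated with add_pooling_layer=False).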
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
class TFCamembertPreTrainedModel(TFPreTrainedModel):
"""
......@@ -945,6 +1058,14 @@ class TFCamembertModel(TFCamembertPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert
class TFCamembertLMHead(tf.keras.layers.Layer):
......@@ -965,10 +1086,18 @@ class TFCamembertLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
......@@ -1080,6 +1209,17 @@ class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelin
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead
class TFCamembertClassificationHead(tf.keras.layers.Layer):
......@@ -1100,6 +1240,7 @@ class TFCamembertClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -1109,6 +1250,17 @@ class TFCamembertClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1186,6 +1338,17 @@ class TFCamembertForSequenceClassification(TFCamembertPreTrainedModel, TFSequenc
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1212,6 +1375,7 @@ class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClass
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1270,6 +1434,17 @@ class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClass
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1292,6 +1467,7 @@ class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceL
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
......@@ -1363,6 +1539,17 @@ class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1384,6 +1571,7 @@ class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsw
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1456,6 +1644,17 @@ class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsw
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
......@@ -1581,3 +1780,14 @@ class TFCamembertForCausalLM(TFCamembertPreTrainedModel, TFCausalLanguageModelin
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
......@@ -169,7 +169,12 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
name="embeddings",
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embedding", None) is not None:
with tf.name_scope(self.patch_embedding.name):
self.patch_embedding.build([None, None, None, self.config.num_channels])
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
"""`pixel_values` is expected to be of NCHW format."""
......@@ -352,6 +357,23 @@ class TFCLIPAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFCLIPMLP(tf.keras.layers.Layer):
def __init__(self, config: CLIPConfig, **kwargs):
......@@ -369,6 +391,7 @@ class TFCLIPMLP(tf.keras.layers.Layer):
self.fc2 = tf.keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.fc1(inputs=hidden_states)
......@@ -376,6 +399,17 @@ class TFCLIPMLP(tf.keras.layers.Layer):
hidden_states = self.fc2(inputs=hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.config.hidden_size])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.intermediate_size])
class TFCLIPEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: CLIPConfig, **kwargs):
......@@ -428,6 +462,23 @@ class TFCLIPEncoderLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, self.embed_dim])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, self.embed_dim])
class TFCLIPEncoder(tf.keras.layers.Layer):
"""
......@@ -483,6 +534,15 @@ class TFCLIPEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFCLIPTextTransformer(tf.keras.layers.Layer):
def __init__(self, config: CLIPTextConfig, **kwargs):
......@@ -496,6 +556,7 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer):
# For `pooled_output` computation
self.eos_token_id = config.eos_token_id
self.embed_dim = config.hidden_size
def call(
self,
......@@ -586,6 +647,20 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer):
return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
@keras_serializable
class TFCLIPTextMainLayer(tf.keras.layers.Layer):
......@@ -634,6 +709,14 @@ class TFCLIPTextMainLayer(tf.keras.layers.Layer):
return text_model_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
class TFCLIPVisionTransformer(tf.keras.layers.Layer):
def __init__(self, config: CLIPVisionConfig, **kwargs):
......@@ -643,6 +726,7 @@ class TFCLIPVisionTransformer(tf.keras.layers.Layer):
self.pre_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
self.encoder = TFCLIPEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.embed_dim = config.hidden_size
def call(
self,
......@@ -679,6 +763,23 @@ class TFCLIPVisionTransformer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "pre_layernorm", None) is not None:
with tf.name_scope(self.pre_layernorm.name):
self.pre_layernorm.build([None, None, self.embed_dim])
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "post_layernorm", None) is not None:
with tf.name_scope(self.post_layernorm.name):
self.post_layernorm.build([None, self.embed_dim])
@keras_serializable
class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
......@@ -714,6 +815,14 @@ class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
return vision_model_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
@keras_serializable
class TFCLIPMainLayer(tf.keras.layers.Layer):
......@@ -757,6 +866,8 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
use_bias=False,
name="text_projection",
)
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
def build(self, input_shape: tf.TensorShape = None):
self.logit_scale = self.add_weight(
......@@ -766,7 +877,21 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
name="logit_scale",
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection.name):
self.visual_projection.build([None, None, self.vision_embed_dim])
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection.name):
self.text_projection.build([None, None, self.text_embed_dim])
@unpack_inputs
def get_text_features(
......@@ -1108,6 +1233,14 @@ class TFCLIPTextModel(TFCLIPPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "clip", None) is not None:
with tf.name_scope(self.clip.name):
self.clip.build(None)
class TFCLIPVisionModel(TFCLIPPreTrainedModel):
config_class = CLIPVisionConfig
......@@ -1162,6 +1295,14 @@ class TFCLIPVisionModel(TFCLIPPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "clip", None) is not None:
with tf.name_scope(self.clip.name):
self.clip.build(None)
@add_start_docstrings(CLIP_START_DOCSTRING)
class TFCLIPModel(TFCLIPPreTrainedModel):
......@@ -1313,3 +1454,11 @@ class TFCLIPModel(TFCLIPPreTrainedModel):
# TensorFlow cannot trace through nested dataclasses. Reference:
# https://github.com/huggingface/transformers/pull/16886
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "clip", None) is not None:
with tf.name_scope(self.clip.name):
self.clip.build(None)
......@@ -81,7 +81,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -103,7 +103,12 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
def call(
......@@ -208,6 +213,7 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, x, batch_size):
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -297,6 +303,29 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
if getattr(self, "key_conv_attn_layer", None) is not None:
with tf.name_scope(self.key_conv_attn_layer.name):
self.key_conv_attn_layer.build([None, None, self.config.hidden_size])
if getattr(self, "conv_kernel_layer", None) is not None:
with tf.name_scope(self.conv_kernel_layer.name):
self.conv_kernel_layer.build([None, None, self.all_head_size])
if getattr(self, "conv_out_layer", None) is not None:
with tf.name_scope(self.conv_out_layer.name):
self.conv_out_layer.build([None, None, self.config.hidden_size])
class TFConvBertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -307,6 +336,7 @@ class TFConvBertSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
......@@ -315,6 +345,17 @@ class TFConvBertSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFConvBertAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -335,6 +376,17 @@ class TFConvBertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class GroupedLinearLayer(tf.keras.layers.Layer):
def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs):
......@@ -389,6 +441,7 @@ class TFConvBertIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
......@@ -396,6 +449,14 @@ class TFConvBertIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFConvBertOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -415,6 +476,7 @@ class TFConvBertOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
......@@ -423,6 +485,17 @@ class TFConvBertOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
class TFConvBertLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -443,6 +516,20 @@ class TFConvBertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
class TFConvBertEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -486,6 +573,15 @@ class TFConvBertEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -501,6 +597,7 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
......@@ -509,6 +606,17 @@ class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
@keras_serializable
class TFConvBertMainLayer(tf.keras.layers.Layer):
......@@ -616,6 +724,20 @@ class TFConvBertMainLayer(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "embeddings_project", None) is not None:
with tf.name_scope(self.embeddings_project.name):
self.embeddings_project.build([None, None, self.config.embedding_size])
class TFConvBertPreTrainedModel(TFPreTrainedModel):
"""
......@@ -770,6 +892,14 @@ class TFConvBertModel(TFConvBertPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
......@@ -814,6 +944,7 @@ class TFConvBertGeneratorPredictions(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense")
self.config = config
def call(self, generator_hidden_states, training=False):
hidden_states = self.dense(generator_hidden_states)
......@@ -822,6 +953,17 @@ class TFConvBertGeneratorPredictions(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingLoss):
......@@ -901,6 +1043,20 @@ class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingL
attentions=generator_hidden_states.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "generator_predictions", None) is not None:
with tf.name_scope(self.generator_predictions.name):
self.generator_predictions.build(None)
if getattr(self, "generator_lm_head", None) is not None:
with tf.name_scope(self.generator_lm_head.name):
self.generator_lm_head.build(None)
class TFConvBertClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
......@@ -931,6 +1087,17 @@ class TFConvBertClassificationHead(tf.keras.layers.Layer):
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -999,6 +1166,17 @@ class TFConvBertForSequenceClassification(TFConvBertPreTrainedModel, TFSequenceC
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1018,6 +1196,7 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLos
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
......@@ -1092,6 +1271,20 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLos
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1113,6 +1306,7 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassif
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1167,6 +1361,17 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassif
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1184,6 +1389,7 @@ class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnswer
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1252,3 +1458,14 @@ class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnswer
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convbert", None) is not None:
with tf.name_scope(self.convbert.name):
self.convbert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
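All of the ConvBERT build() methods above follow one pattern: return early if the layer is already built, mark it as built, then build each sublayer inside tf.name_scope(sublayer.name) with an explicit [batch, sequence, width] shape taken from the config. The sketch below is a hypothetical toy layer (not part of this diff) showing the same pattern in isolation under tf.keras 2.x; calling build(None) creates every weight without a forward pass.

import tensorflow as tf

class ToyBlock(tf.keras.layers.Layer):
    # Hypothetical layer, only to illustrate the build() pattern used throughout this PR.
    def __init__(self, hidden_size=64, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.hidden_size = hidden_size

    def call(self, hidden_states):
        return self.LayerNorm(self.dense(hidden_states))

    def build(self, input_shape=None):
        if self.built:  # repeated calls become no-ops
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                # the last axis is known from the config, so no example input is needed
                self.dense.build([None, None, self.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.hidden_size])

block = ToyBlock()
block.build(None)          # weights now exist without any data having been passed
print(len(block.weights))  # 4: kernel, bias, gamma, beta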
......@@ -81,6 +81,7 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
)
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels
self.config = config
def call(self, pixel_values):
if isinstance(pixel_values, dict):
......@@ -101,6 +102,17 @@ class TFConvNextEmbeddings(tf.keras.layers.Layer):
embeddings = self.layernorm(embeddings)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build([None, None, None, self.config.num_channels])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextLayer(tf.keras.layers.Layer):
"""This corresponds to the `Block` class in the original implementation.
......@@ -167,7 +179,25 @@ class TFConvNextLayer(tf.keras.layers.Layer):
if self.config.layer_scale_init_value > 0
else None
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dwconv", None) is not None:
with tf.name_scope(self.dwconv.name):
self.dwconv.build([None, None, None, self.dim])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.dim])
if getattr(self, "pwconv1", None) is not None:
with tf.name_scope(self.pwconv1.name):
self.pwconv1.build([None, None, self.dim])
if getattr(self, "pwconv2", None) is not None:
with tf.name_scope(self.pwconv2.name):
self.pwconv2.build([None, None, 4 * self.dim])
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
def call(self, hidden_states, training=False):
input = hidden_states
......@@ -245,6 +275,9 @@ class TFConvNextStage(tf.keras.layers.Layer):
)
for j in range(depth)
]
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
def call(self, hidden_states):
for layer in self.downsampling_layer:
......@@ -253,6 +286,20 @@ class TFConvNextStage(tf.keras.layers.Layer):
hidden_states = layer(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
if self.in_channels != self.out_channels or self.stride > 1:
with tf.name_scope(self.downsampling_layer[0].name):
self.downsampling_layer[0].build([None, None, None, self.in_channels])
with tf.name_scope(self.downsampling_layer[1].name):
self.downsampling_layer[1].build([None, None, None, self.in_channels])
class TFConvNextEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -293,6 +340,11 @@ class TFConvNextEncoder(tf.keras.layers.Layer):
return TFBaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
def build(self, input_shape=None):
for stage in self.stages:
with tf.name_scope(stage.name):
stage.build(None)
@keras_serializable
class TFConvNextMainLayer(tf.keras.layers.Layer):
......@@ -353,6 +405,20 @@ class TFConvNextMainLayer(tf.keras.layers.Layer):
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, self.config.hidden_sizes[-1]])
class TFConvNextPreTrainedModel(TFPreTrainedModel):
"""
......@@ -485,6 +551,14 @@ class TFConvNextModel(TFConvNextPreTrainedModel):
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnext", None) is not None:
with tf.name_scope(self.convnext.name):
self.convnext.build(None)
@add_start_docstrings(
"""
......@@ -507,6 +581,7 @@ class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClas
bias_initializer="zeros",
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
......@@ -577,3 +652,15 @@ class TFConvNextForImageClassification(TFConvNextPreTrainedModel, TFSequenceClas
logits=logits,
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnext", None) is not None:
with tf.name_scope(self.convnext.name):
self.convnext.build(None)
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
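In the ConvNext hunks the shapes handed to build() are channels-last (NHWC), so each Conv2D or LayerNormalization only needs the channel count from the config, e.g. config.num_channels for the patch embedding and config.hidden_sizes[0] for its layernorm. A small illustrative snippet with made-up sizes (not taken from the diff):

import tensorflow as tf

# Build a ConvNext-style patch embedding eagerly, the way the hunks above do.
patch_embeddings = tf.keras.layers.Conv2D(filters=96, kernel_size=4, strides=4, name="patch_embeddings")
patch_embeddings.build([None, None, None, 3])   # NHWC, only the 3 input channels matter
print(patch_embeddings.kernel.shape)            # (4, 4, 3, 96)

layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
layernorm.build([None, None, None, 96])         # normalizes over the channel axis
print(layernorm.gamma.shape)                    # (96,)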
......@@ -133,6 +133,7 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer):
)
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
self.num_channels = config.num_channels
self.config = config
def call(self, pixel_values):
if isinstance(pixel_values, dict):
......@@ -153,6 +154,17 @@ class TFConvNextV2Embeddings(tf.keras.layers.Layer):
embeddings = self.layernorm(embeddings)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build([None, None, None, self.config.num_channels])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])
class TFConvNextV2Layer(tf.keras.layers.Layer):
"""This corresponds to the `Block` class in the original implementation.
......@@ -223,6 +235,29 @@ class TFConvNextV2Layer(tf.keras.layers.Layer):
x = input + x
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dwconv", None) is not None:
with tf.name_scope(self.dwconv.name):
self.dwconv.build([None, None, None, self.dim])
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, None, self.dim])
if getattr(self, "pwconv1", None) is not None:
with tf.name_scope(self.pwconv1.name):
self.pwconv1.build([None, None, self.dim])
if getattr(self, "grn", None) is not None:
with tf.name_scope(self.grn.name):
self.grn.build(None)
if getattr(self, "pwconv2", None) is not None:
with tf.name_scope(self.pwconv2.name):
self.pwconv2.build([None, None, 4 * self.dim])
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2
class TFConvNextV2Stage(tf.keras.layers.Layer):
......@@ -286,6 +321,9 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
)
for j in range(depth)
]
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
def call(self, hidden_states):
for layer in self.downsampling_layer:
......@@ -294,6 +332,20 @@ class TFConvNextV2Stage(tf.keras.layers.Layer):
hidden_states = layer(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
if self.in_channels != self.out_channels or self.stride > 1:
with tf.name_scope(self.downsampling_layer[0].name):
self.downsampling_layer[0].build([None, None, None, self.in_channels])
with tf.name_scope(self.downsampling_layer[1].name):
self.downsampling_layer[1].build([None, None, None, self.in_channels])
class TFConvNextV2Encoder(tf.keras.layers.Layer):
def __init__(self, config: ConvNextV2Config, **kwargs):
......@@ -339,6 +391,11 @@ class TFConvNextV2Encoder(tf.keras.layers.Layer):
return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
def build(self, input_shape=None):
for stage in self.stages:
with tf.name_scope(stage.name):
stage.build(None)
@keras_serializable
class TFConvNextV2MainLayer(tf.keras.layers.Layer):
......@@ -401,6 +458,20 @@ class TFConvNextV2MainLayer(tf.keras.layers.Layer):
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, self.config.hidden_sizes[-1]])
class TFConvNextV2PreTrainedModel(TFPreTrainedModel):
"""
......@@ -519,6 +590,14 @@ class TFConvNextV2Model(TFConvNextV2PreTrainedModel):
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnextv2", None) is not None:
with tf.name_scope(self.convnextv2.name):
self.convnextv2.build(None)
@add_start_docstrings(
"""
......@@ -593,3 +672,14 @@ class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequence
logits=logits,
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convnextv2", None) is not None:
with tf.name_scope(self.convnextv2.name):
self.convnextv2.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
......@@ -142,6 +142,23 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "Wq", None) is not None:
with tf.name_scope(self.Wq.name):
self.Wq.build([None, None, self.d_model_size])
if getattr(self, "Wk", None) is not None:
with tf.name_scope(self.Wk.name):
self.Wk.build([None, None, self.d_model_size])
if getattr(self, "Wv", None) is not None:
with tf.name_scope(self.Wv.name):
self.Wv.build([None, None, self.d_model_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.d_model_size])
class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
def __init__(self, d_model_size, dff, **kwargs):
......@@ -149,6 +166,8 @@ class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
self.dense_0 = tf.keras.layers.Dense(dff, activation="relu", name="0")
self.dense_2 = tf.keras.layers.Dense(d_model_size, name="2")
self.d_model_size = d_model_size
self.dff = dff
def call(self, inputs, trainable=False):
dense_0_output = self.dense_0(inputs)
......@@ -156,6 +175,17 @@ class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
return dense_2_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense_0", None) is not None:
with tf.name_scope(self.dense_0.name):
self.dense_0.build([None, None, self.d_model_size])
if getattr(self, "dense_2", None) is not None:
with tf.name_scope(self.dense_2.name):
self.dense_2.build([None, None, self.dff])
class TFEncoderLayer(tf.keras.layers.Layer):
def __init__(
......@@ -175,6 +205,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
self.d_model_size = d_model_size
def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
normed = self.layernorm1(x)
......@@ -202,6 +233,23 @@ class TFEncoderLayer(tf.keras.layers.Layer):
outputs = (out2,) + attn_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "multi_head_attention", None) is not None:
with tf.name_scope(self.multi_head_attention.name):
self.multi_head_attention.build(None)
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build(None)
if getattr(self, "layernorm1", None) is not None:
with tf.name_scope(self.layernorm1.name):
self.layernorm1.build([None, None, self.d_model_size])
if getattr(self, "layernorm2", None) is not None:
with tf.name_scope(self.layernorm2.name):
self.layernorm2.build([None, None, self.d_model_size])
@keras_serializable
class TFCTRLMainLayer(tf.keras.layers.Layer):
......@@ -396,6 +444,21 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
attentions=all_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "w", None) is not None:
with tf.name_scope(self.w.name):
self.w.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.n_embd])
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
class TFCTRLPreTrainedModel(TFPreTrainedModel):
"""
......@@ -563,6 +626,14 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
class TFCTRLBiasLayer(tf.keras.layers.Layer):
"""
......@@ -710,6 +781,17 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
@add_start_docstrings(
"""
......@@ -737,6 +819,7 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
use_bias=False,
)
self.transformer = TFCTRLMainLayer(config, name="transformer")
self.config = config
def get_output_embeddings(self):
# Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too.
......@@ -836,3 +919,14 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.n_embd])
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
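A detail worth noting in the CTRL hunks (and in every other file here): sublayers that are themselves Transformers layers, such as multi_head_attention or ffn, are built with None, because their own build() reads every dimension from attributes stored in __init__, whereas stock Keras layers like Dense and LayerNormalization need the last axis spelled out. The toy composite below (hypothetical names, tf.keras 2.x) shows both cases side by side.

import tensorflow as tf

class ToyFFN(tf.keras.layers.Layer):
    # Stands in for a layer like TFPointWiseFeedForwardLayer: sizes live on the
    # instance, so build() can ignore input_shape entirely.
    def __init__(self, d_model, dff, **kwargs):
        super().__init__(**kwargs)
        self.dense_0 = tf.keras.layers.Dense(dff, activation="relu", name="dense_0")
        self.dense_2 = tf.keras.layers.Dense(d_model, name="dense_2")
        self.d_model = d_model
        self.dff = dff

    def call(self, x):
        return self.dense_2(self.dense_0(x))

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        with tf.name_scope(self.dense_0.name):
            self.dense_0.build([None, None, self.d_model])
        with tf.name_scope(self.dense_2.name):
            self.dense_2.build([None, None, self.dff])

class ToyEncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model=8, dff=32, **kwargs):
        super().__init__(**kwargs)
        self.ffn = ToyFFN(d_model, dff, name="ffn")
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
        self.d_model = d_model

    def call(self, x):
        return self.layernorm(x + self.ffn(x))

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        with tf.name_scope(self.ffn.name):
            self.ffn.build(None)  # custom layer: None is enough, it knows its own sizes
        with tf.name_scope(self.layernorm.name):
            self.layernorm.build([None, None, self.d_model])  # stock layer: give the last axis

ToyEncoderLayer().build(None)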
......@@ -107,6 +107,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
self,
config: CvtConfig,
patch_size: int,
num_channels: int,
embed_dim: int,
stride: int,
padding: int,
......@@ -117,6 +118,7 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
self.convolution_embeddings = TFCvtConvEmbeddings(
config,
patch_size=patch_size,
num_channels=num_channels,
embed_dim=embed_dim,
stride=stride,
padding=padding,
......@@ -129,11 +131,28 @@ class TFCvtEmbeddings(tf.keras.layers.Layer):
hidden_state = self.dropout(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_embeddings", None) is not None:
with tf.name_scope(self.convolution_embeddings.name):
self.convolution_embeddings.build(None)
class TFCvtConvEmbeddings(tf.keras.layers.Layer):
"""Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
def __init__(self, config: CvtConfig, patch_size: int, embed_dim: int, stride: int, padding: int, **kwargs):
def __init__(
self,
config: CvtConfig,
patch_size: int,
num_channels: int,
embed_dim: int,
stride: int,
padding: int,
**kwargs,
):
super().__init__(**kwargs)
self.padding = tf.keras.layers.ZeroPadding2D(padding=padding)
self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
......@@ -148,6 +167,8 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
)
# Using the same default epsilon as PyTorch
self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
self.num_channels = num_channels
self.embed_dim = embed_dim
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
if isinstance(pixel_values, dict):
......@@ -165,6 +186,17 @@ class TFCvtConvEmbeddings(tf.keras.layers.Layer):
pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
return pixel_values
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, self.embed_dim])
class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
"""Convolutional projection layer."""
......@@ -184,12 +216,24 @@ class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
)
# Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum)
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution(self.padding(hidden_state))
hidden_state = self.normalization(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.embed_dim])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.embed_dim])
class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer):
"""Linear projection layer used to flatten tokens into 1D."""
......@@ -227,6 +271,14 @@ class TFCvtSelfAttentionProjection(tf.keras.layers.Layer):
hidden_state = self.linear_projection(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_projection", None) is not None:
with tf.name_scope(self.convolution_projection.name):
self.convolution_projection.build(None)
class TFCvtSelfAttention(tf.keras.layers.Layer):
"""
......@@ -348,6 +400,29 @@ class TFCvtSelfAttention(tf.keras.layers.Layer):
context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
return context
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_projection_query", None) is not None:
with tf.name_scope(self.convolution_projection_query.name):
self.convolution_projection_query.build(None)
if getattr(self, "convolution_projection_key", None) is not None:
with tf.name_scope(self.convolution_projection_key.name):
self.convolution_projection_key.build(None)
if getattr(self, "convolution_projection_value", None) is not None:
with tf.name_scope(self.convolution_projection_value.name):
self.convolution_projection_value.build(None)
if getattr(self, "projection_query", None) is not None:
with tf.name_scope(self.projection_query.name):
self.projection_query.build([None, None, self.embed_dim])
if getattr(self, "projection_key", None) is not None:
with tf.name_scope(self.projection_key.name):
self.projection_key.build([None, None, self.embed_dim])
if getattr(self, "projection_value", None) is not None:
with tf.name_scope(self.projection_value.name):
self.projection_value.build([None, None, self.embed_dim])
class TFCvtSelfOutput(tf.keras.layers.Layer):
"""Output of the Attention layer ."""
......@@ -358,12 +433,21 @@ class TFCvtSelfOutput(tf.keras.layers.Layer):
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.dense(inputs=hidden_state)
hidden_state = self.dropout(inputs=hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.embed_dim])
class TFCvtAttention(tf.keras.layers.Layer):
"""Attention layer. First chunk of the convolutional transformer block."""
......@@ -411,6 +495,17 @@ class TFCvtAttention(tf.keras.layers.Layer):
attention_output = self.dense_output(self_output, training=training)
return attention_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFCvtIntermediate(tf.keras.layers.Layer):
"""Intermediate dense layer. Second chunk of the convolutional transformer block."""
......@@ -423,23 +518,34 @@ class TFCvtIntermediate(tf.keras.layers.Layer):
activation="gelu",
name="dense",
)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
hidden_state = self.dense(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.embed_dim])
class TFCvtOutput(tf.keras.layers.Layer):
"""
Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
"""
def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: int, **kwargs):
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim
self.mlp_ratio = mlp_ratio
def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.dense(inputs=hidden_state)
......@@ -447,6 +553,14 @@ class TFCvtOutput(tf.keras.layers.Layer):
hidden_state = hidden_state + input_tensor
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])
class TFCvtLayer(tf.keras.layers.Layer):
"""
......@@ -492,7 +606,7 @@ class TFCvtLayer(tf.keras.layers.Layer):
name="attention",
)
self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
self.dense_output = TFCvtOutput(config, embed_dim, drop_rate, name="output")
self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output")
# Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour.
self.drop_path = (
TFCvtDropPath(drop_path_rate, name="drop_path")
......@@ -502,6 +616,7 @@ class TFCvtLayer(tf.keras.layers.Layer):
# Using the same default epsilon as PyTorch
self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
# in Cvt, layernorm is applied before self-attention
......@@ -520,6 +635,29 @@ class TFCvtLayer(tf.keras.layers.Layer):
layer_output = self.drop_path(layer_output, training=training)
return layer_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.embed_dim])
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.embed_dim])
class TFCvtStage(tf.keras.layers.Layer):
"""
......@@ -548,6 +686,7 @@ class TFCvtStage(tf.keras.layers.Layer):
self.embedding = TFCvtEmbeddings(
self.config,
patch_size=config.patch_sizes[self.stage],
num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
stride=config.patch_stride[self.stage],
embed_dim=config.embed_dim[self.stage],
padding=config.patch_padding[self.stage],
......@@ -603,6 +742,18 @@ class TFCvtStage(tf.keras.layers.Layer):
hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
return hidden_state, cls_token
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding", None) is not None:
with tf.name_scope(self.embedding.name):
self.embedding.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFCvtEncoder(tf.keras.layers.Layer):
"""
......@@ -655,6 +806,15 @@ class TFCvtEncoder(tf.keras.layers.Layer):
hidden_states=all_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stages", None) is not None:
for layer in self.stages:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFCvtMainLayer(tf.keras.layers.Layer):
......@@ -696,6 +856,14 @@ class TFCvtMainLayer(tf.keras.layers.Layer):
hidden_states=encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
class TFCvtPreTrainedModel(TFPreTrainedModel):
"""
......@@ -815,6 +983,14 @@ class TFCvtModel(TFCvtPreTrainedModel):
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "cvt", None) is not None:
with tf.name_scope(self.cvt.name):
self.cvt.build(None)
@add_start_docstrings(
"""
......@@ -840,6 +1016,7 @@ class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassification
bias_initializer="zeros",
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
......@@ -909,3 +1086,18 @@ class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassification
return ((loss,) + output) if loss is not None else output
return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "cvt", None) is not None:
with tf.name_scope(self.cvt.name):
self.cvt.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.embed_dim[-1]])
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.embed_dim[-1]])
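The CVT file is the one place above where constructor signatures change rather than just gaining stored attributes: TFCvtEmbeddings and TFCvtConvEmbeddings now take num_channels, and TFCvtOutput takes mlp_ratio, solely so that build() can work out each sublayer's input width (the output projection's input is embed_dim * mlp_ratio wide). Below is a hedged toy version of the TFCvtOutput change, with illustrative names and sizes only:

import tensorflow as tf

class ToyMlpOutput(tf.keras.layers.Layer):
    # The projection back from the expanded MLP width; mlp_ratio has to be kept
    # around because this dense layer's *input* is embed_dim * mlp_ratio wide.
    def __init__(self, embed_dim=64, mlp_ratio=4, drop_rate=0.0, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(embed_dim, name="dense")
        self.dropout = tf.keras.layers.Dropout(drop_rate)
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio

    def call(self, hidden_state, input_tensor, training=False):
        hidden_state = self.dropout(self.dense(hidden_state), training=training)
        return hidden_state + input_tensor

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        with tf.name_scope(self.dense.name):
            self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])

out = ToyMlpOutput()
out.build(None)
print(out.dense.kernel.shape)  # (256, 64): the input width comes from mlp_ratio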
......@@ -84,7 +84,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.dropout)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -99,7 +99,12 @@ class TFEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.dim])
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
"""
......@@ -152,6 +157,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
)
self.pruned_heads = set()
self.config = config
def prune_heads(self, heads):
raise NotImplementedError
......@@ -212,6 +218,23 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
else:
return (context,)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.config.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.config.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.config.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.config.dim])
class TFFFN(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -224,6 +247,7 @@ class TFFFN(tf.keras.layers.Layer):
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
)
self.activation = get_tf_activation(config.activation)
self.config = config
def call(self, input, training=False):
x = self.lin1(input)
......@@ -232,6 +256,17 @@ class TFFFN(tf.keras.layers.Layer):
x = self.dropout(x, training=training)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.config.dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.config.hidden_dim])
class TFTransformerBlock(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -253,6 +288,7 @@ class TFTransformerBlock(tf.keras.layers.Layer):
self.ffn = TFFFN(config, name="ffn")
self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
self.config = config
def call(self, x, attn_mask, head_mask, output_attentions, training=False): # removed: src_enc=None, src_len=None
"""
......@@ -281,6 +317,23 @@ class TFTransformerBlock(tf.keras.layers.Layer):
output = (sa_weights,) + output
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "sa_layer_norm", None) is not None:
with tf.name_scope(self.sa_layer_norm.name):
self.sa_layer_norm.build([None, None, self.config.dim])
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build(None)
if getattr(self, "output_layer_norm", None) is not None:
with tf.name_scope(self.output_layer_norm.name):
self.output_layer_norm.build([None, None, self.config.dim])
class TFTransformer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -336,6 +389,15 @@ class TFTransformer(tf.keras.layers.Layer):
last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFDistilBertMainLayer(tf.keras.layers.Layer):
......@@ -412,6 +474,17 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class TFDistilBertPreTrainedModel(TFPreTrainedModel):
......@@ -548,6 +621,14 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
class TFDistilBertLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
......@@ -667,6 +748,23 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "vocab_transform", None) is not None:
with tf.name_scope(self.vocab_transform.name):
self.vocab_transform.build([None, None, self.config.dim])
if getattr(self, "vocab_layer_norm", None) is not None:
with tf.name_scope(self.vocab_layer_norm.name):
self.vocab_layer_norm.build([None, None, self.config.dim])
if getattr(self, "vocab_projector", None) is not None:
with tf.name_scope(self.vocab_projector.name):
self.vocab_projector.build(None)
@add_start_docstrings(
"""
......@@ -691,6 +789,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -746,6 +845,20 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "pre_classifier", None) is not None:
with tf.name_scope(self.pre_classifier.name):
self.pre_classifier.build([None, None, self.config.dim])
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.dim])
@add_start_docstrings(
"""
......@@ -764,6 +877,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -814,6 +928,17 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -837,6 +962,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
......@@ -908,6 +1034,20 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "pre_classifier", None) is not None:
with tf.name_scope(self.pre_classifier.name):
self.pre_classifier.build([None, None, self.config.dim])
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.dim])
@add_start_docstrings(
"""
......@@ -926,6 +1066,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
)
assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2"
self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -991,3 +1132,14 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.dim])
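Finally, the reason every manual build is wrapped in tf.name_scope(layer.name): when a sublayer is built outside of __call__, tf.keras 2.x gives its variables flat names unless a scope is pushed explicitly, and the hierarchical names (for example .../transformer/layer_._0/ffn/lin1/kernel:0 for DistilBERT) are what the existing checkpoint weight names are matched against. A minimal hedged check, unrelated to any particular model in this diff:

import tensorflow as tf

scoped = tf.keras.layers.Dense(4, name="q_lin")
unscoped = tf.keras.layers.Dense(4, name="k_lin")

# The pattern used throughout this PR: push the layer's name scope before building.
with tf.name_scope("attention"):
    with tf.name_scope(scoped.name):
        scoped.build([None, 4])

# Building without a scope loses the hierarchy (and names would collide across layers).
unscoped.build([None, 4])

print(scoped.kernel.name)    # typically "attention/q_lin/kernel:0"
print(unscoped.kernel.name)  # typically "kernel:0"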