Unverified Commit 050e0b44 authored by Matt, committed by GitHub

Proper build() methods for TF (#27794)

* Add a convenience method for building in your own name scope

* Second attempt at auto layer building

* Revert "Second attempt at auto layer building"

This reverts commit e03a3aaecf9ec41a805582b83cbdfe3290a631be.

* Attempt #3

* Revert "Attempt #3"

This reverts commit b9df7a0857560d29b5abbed6127d9e9eca77cf47.

* Add missing attributes that we're going to need later

* Add some attributes we're going to need later

* A fourth attempt! Feel the power flow through you!

* Revert "A fourth attempt! Feel the power flow through you!"

This reverts commit 6bf4aaf3875d6f28485f50187617a4c616c8aff7.

* Add more values we'll need later

* TF refactor that we'll need later

* Revert "TF refactor that we'll need later"

This reverts commit ca07202fb5b7b7436b893baa8d688b4f348ea7b9.

* Revert "Revert "TF refactor that we'll need later""

This reverts commit 1beb0f39f293ed9c27594575e1c849aadeb15c13.

* make fixup

* Attempt five!

* Revert "Attempt five!"

This reverts commit 3302207958dfd0374b0447a51c06eea51a506044.

* Attempt six - this time don't add empty methods

* Revert "Attempt six - this time don't add empty methods"

This reverts commit 67d60129be75416b6beb8f47c7d38d77b18d79bb.

* Attempt seven - better base model class detection!

* Revert "Attempt seven - better base model class detection!"

This reverts commit 5f14845e92ea0e87c598da933bfbfee10f553bc9.

* Another attribute we'll need later

* Try again with the missing attribute!

* Revert "Try again with the missing attribute!"

This reverts commit 760c6f30c5dffb3e04b0e73c34a77d1882a0fef7.

* This is the attempt that will pierce the heavens!

* Revert "This is the attempt that will pierce the heavens!"

This reverts commit c868bb657de057aca7a5260350a3f831fc4dfee6.

* Attempt seven - snag list is steadily decreasing

* Revert "Attempt seven - snag list is steadily decreasing"

This reverts commit 46fbd975deda64429bfb3e5fac4fc0370c00d316.

* Attempt eight - will an empty snag list do it?

* Revert "Attempt eight - will an empty snag list do it?"

This reverts commit 7c8a3c2b083253649569e9877e02054ae5cec67b.

* Fixes to Hubert issues that cause problems later

* Trying again with Conv1D/SeparableConv fixes

* Revert "Trying again with Conv1D/SeparableConv fixes"

This reverts commit 55092bca952bc0f750aa1ffe246a640bf1e2036e.

* Apply the build shape fixes to Wav2Vec2 as well

* One more attempt!

* Revert "One more attempt!"

This reverts commit 5ac3e4cb01b9458cc93312873725f9444ae7261c.

* Another attempt!

* Revert "Another attempt!"

This reverts commit ea16d890e019d7de8792a3b8e72f3b1c02adae50.

* Let's see how many failures we get without the internal build method

* Fix OpenAI

* Fix MobileBERT

* (Mostly) fix GroupViT

* Fix BLIP

* One more BLIP fix

* One more BLIP fix!

* Fix Regnet

* Finally fully fix GroupViT

* Fix Data2Vec and add the new AdaptivePool

* Fix Segformer

* Fix Albert

* Fix Deberta/DebertaV2

* Fix XLM

* Actually fix XLM

* Fix Flaubert

* Fix lxmert

* Fix Resnet

* Fix ConvBERT

* Fix ESM

* Fix Convnext / ConvnextV2

* Fix SAM

* Fix Efficientformer

* Fix LayoutLMv3

* Fix speech_to_text

* Fix mpnet and mobilevit

* Fix Swin

* Fix CTRL

* Fix CVT

* Fix DPR

* Fix Wav2Vec2

* Fix T5

* Fix Hubert

* Fix GPT2

* Fix Whisper

* Fix DeiT

* Fix the encoder-decoder / dual-encoder classes

* make fix-copies

* build in name scope

* Fix summarization test

* Fix tied weight names for BART + Blenderbot

* Fix tied weight name building

* Fix to TFESM weight building

* Update TF SAM

* Expand all the shapes out into Big Boy Shapes
parent 52c37882
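The diff below applies one recurring pattern to every model file: each layer keeps a reference to its config (or the raw dimensions it needs) in __init__, and gains an explicit, idempotent build() that constructs every sub-layer inside its own tf.name_scope, passing an explicit input shape instead of waiting for a first forward pass to trigger weight creation. A minimal sketch of that pattern, with illustrative names rather than an actual transformers class:

```python
# Illustrative sketch only - not a class from this commit.
import tensorflow as tf


class ExampleBlock(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
        self.LayerNorm = tf.keras.layers.LayerNormalization(name="LayerNorm")
        # the config is kept so build() can read the dimensions it needs later
        self.config = config

    def build(self, input_shape=None):
        if self.built:  # idempotent: a second call is a no-op
            return
        self.built = True
        # each sub-layer is built under its own name scope so its weights get
        # the hierarchical names that name-based checkpoint loading expects
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])
```

Composite layers (attention blocks, encoders, whole models) are built with sub_layer.build(None), since their own build() never reads input_shape and only dispatches to their children.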
......@@ -91,7 +91,7 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -106,7 +106,12 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids):
"""
......@@ -165,6 +170,7 @@ class TFMPNetPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -174,6 +180,14 @@ class TFMPNetPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFMPNetSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -203,6 +217,7 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o"
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, x, batch_size):
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -247,6 +262,23 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
outputs = (o, attention_probs) if output_attentions else (o,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q", None) is not None:
with tf.name_scope(self.q.name):
self.q.build([None, None, self.config.hidden_size])
if getattr(self, "k", None) is not None:
with tf.name_scope(self.k.name):
self.k.build([None, None, self.config.hidden_size])
if getattr(self, "v", None) is not None:
with tf.name_scope(self.v.name):
self.v.build([None, None, self.config.hidden_size])
if getattr(self, "o", None) is not None:
with tf.name_scope(self.o.name):
self.o.build([None, None, self.config.hidden_size])
class TFMPNetAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -255,6 +287,7 @@ class TFMPNetAttention(tf.keras.layers.Layer):
self.attn = TFMPNetSelfAttention(config, name="attn")
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def prune_heads(self, heads):
raise NotImplementedError
......@@ -267,6 +300,17 @@ class TFMPNetAttention(tf.keras.layers.Layer):
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet
class TFMPNetIntermediate(tf.keras.layers.Layer):
......@@ -281,6 +325,7 @@ class TFMPNetIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -288,6 +333,14 @@ class TFMPNetIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet
class TFMPNetOutput(tf.keras.layers.Layer):
......@@ -299,6 +352,7 @@ class TFMPNetOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -307,6 +361,17 @@ class TFMPNetOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFMPNetLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -329,6 +394,20 @@ class TFMPNetLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "out", None) is not None:
with tf.name_scope(self.out.name):
self.out.build(None)
class TFMPNetEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -344,15 +423,20 @@ class TFMPNetEncoder(tf.keras.layers.Layer):
self.layer = [TFMPNetLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
self.relative_attention_num_buckets = config.relative_attention_num_buckets
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope("relative_attention_bias"):
self.relative_attention_bias = self.add_weight(
name="embeddings",
shape=[self.relative_attention_num_buckets, self.n_heads],
initializer=get_initializer(self.initializer_range),
)
return super().build(input_shape)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def call(
self,
......@@ -561,6 +645,20 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
MPNET_START_DOCSTRING = r"""
......@@ -693,6 +791,14 @@ class TFMPNetModel(TFMPNetPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
class TFMPNetLMHead(tf.keras.layers.Layer):
"""MPNet head for masked and permuted language modeling"""
......@@ -712,10 +818,18 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
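Note on the head above: the only weight it builds itself is the (vocab_size,) output bias, because the output projection is tied to the input embedding matrix (self.decoder); the dense and layer_norm sub-layers receive explicit hidden_size shapes like the other leaf layers. A rough, illustrative sketch of how such a tied head produces logits (not the exact transformers call()):

```python
import tensorflow as tf


def tied_lm_logits(hidden_states, embedding_matrix, bias, dense, act, layer_norm):
    # hidden_states: [batch, seq, hidden_size]; embedding_matrix: [vocab_size, hidden_size]
    h = layer_norm(act(dense(hidden_states)))
    logits = tf.matmul(h, embedding_matrix, transpose_b=True)  # -> [batch, seq, vocab_size]
    return tf.nn.bias_add(logits, bias)  # bias has shape (vocab_size,), built as above
```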
......@@ -816,6 +930,17 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFMPNetClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
......@@ -832,6 +957,7 @@ class TFMPNetClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -841,6 +967,17 @@ class TFMPNetClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -913,6 +1050,17 @@ class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassif
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -930,6 +1078,7 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -999,6 +1148,17 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1019,6 +1179,7 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1073,6 +1234,17 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1092,6 +1264,7 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1159,3 +1332,14 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
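The shapes passed to build() throughout this commit are symbolic everywhere except the last axis (the "Big Boy Shapes" of the commit message), because Dense and LayerNormalization only need the size of the final dimension to create their variables. A quick standalone illustration:

```python
import tensorflow as tf

dense = tf.keras.layers.Dense(4)
dense.build([None, None, 16])
print(dense.kernel.shape)   # (16, 4) - only the last input dim matters

ln = tf.keras.layers.LayerNormalization()
ln.build([None, None, 16])
print(ln.gamma.shape)       # (16,)
```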
......@@ -78,6 +78,7 @@ class TFAttention(tf.keras.layers.Layer):
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.n_state = n_state
self.pruned_heads = set()
def prune_heads(self, heads):
......@@ -153,6 +154,17 @@ class TFAttention(tf.keras.layers.Layer):
outputs = [a] + attn_outputs[1:]
return outputs # a, (attentions)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_attn", None) is not None:
with tf.name_scope(self.c_attn.name):
self.c_attn.build([None, None, self.n_state * 3])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.n_state])
class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
......@@ -162,6 +174,8 @@ class TFMLP(tf.keras.layers.Layer):
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = get_tf_activation("gelu")
self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.nx = nx
self.n_state = n_state
def call(self, x, training=False):
h = self.act(self.c_fc(x))
......@@ -169,6 +183,17 @@ class TFMLP(tf.keras.layers.Layer):
h2 = self.dropout(h2, training=training)
return h2
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_fc", None) is not None:
with tf.name_scope(self.c_fc.name):
self.c_fc.build([None, None, self.n_state])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.nx])
class TFBlock(tf.keras.layers.Layer):
def __init__(self, config, scale=False, **kwargs):
......@@ -178,6 +203,7 @@ class TFBlock(tf.keras.layers.Layer):
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.mlp = TFMLP(4 * nx, config, name="mlp")
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
self.nx = nx
def call(self, x, attention_mask, head_mask, output_attentions, training=False):
output_attn = self.attn(x, attention_mask, head_mask, output_attentions, training=training)
......@@ -190,6 +216,23 @@ class TFBlock(tf.keras.layers.Layer):
outputs = [h] + output_attn[1:]
return outputs # x, (attentions)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "ln_1", None) is not None:
with tf.name_scope(self.ln_1.name):
self.ln_1.build([None, None, self.nx])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "ln_2", None) is not None:
with tf.name_scope(self.ln_2.name):
self.ln_2.build([None, None, self.nx])
@keras_serializable
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
......@@ -213,7 +256,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
def build(self, input_shape):
def build(self, input_shape=None):
with tf.name_scope("positions_embed"):
self.positions_embed = self.add_weight(
name="embeddings",
......@@ -221,7 +264,16 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "tokens_embed", None) is not None:
with tf.name_scope(self.tokens_embed.name):
self.tokens_embed.build(None)
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
def get_input_embeddings(self):
return self.tokens_embed
......@@ -528,6 +580,14 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@add_start_docstrings(
"""
......@@ -613,6 +673,14 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
def prepare_inputs_for_generation(self, inputs, **kwargs):
return {"input_ids": inputs}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@add_start_docstrings(
"""
......@@ -734,6 +802,17 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
"mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "multiple_choice_head", None) is not None:
with tf.name_scope(self.multiple_choice_head.name):
self.multiple_choice_head.build(None)
@add_start_docstrings(
"""
......@@ -761,6 +840,7 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc
use_bias=False,
)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
......@@ -848,3 +928,14 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "score", None) is not None:
with tf.name_scope(self.score.name):
self.score.build([None, None, self.config.n_embd])
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
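Building each sub-layer inside with tf.name_scope(sub_layer.name) is what keeps the variable names identical to the ones previously produced by a real forward pass, which is what name-based checkpoint loading matches against. A rough standalone illustration with plain Keras layers (names chosen for the example):

```python
import tensorflow as tf

proj = tf.keras.layers.Dense(8, name="proj")
with tf.name_scope("block_0"):          # parent layer's scope
    with tf.name_scope(proj.name):      # sub-layer's own scope
        proj.build([None, None, 16])    # creates the kernel without a forward pass

print(proj.kernel.name)   # expected to look like "block_0/proj/kernel:0"
```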
......@@ -268,6 +268,23 @@ class TFOPTAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFOPTDecoderLayer(tf.keras.layers.Layer):
def __init__(self, config: OPTConfig, **kwargs):
......@@ -288,6 +305,7 @@ class TFOPTDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -354,6 +372,26 @@ class TFOPTDecoderLayer(tf.keras.layers.Layer):
return (hidden_states, self_attn_weights, present_key_value)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
OPT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
......@@ -696,6 +734,30 @@ class TFOPTDecoder(tf.keras.layers.Layer):
attentions=all_self_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "project_out", None) is not None:
with tf.name_scope(self.project_out.name):
self.project_out.build([None, None, self.config.hidden_size])
if getattr(self, "project_in", None) is not None:
with tf.name_scope(self.project_in.name):
self.project_in.build([None, None, self.config.word_embed_proj_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFOPTMainLayer(tf.keras.layers.Layer):
......@@ -757,6 +819,14 @@ class TFOPTMainLayer(tf.keras.layers.Layer):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare TF OPT Model outputting raw hidden-states without any specific head on top.",
......@@ -841,6 +911,14 @@ class TFOPTModel(TFOPTPreTrainedModel):
attentions=attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"""
......@@ -1006,3 +1084,11 @@ class TFOPTForCausalLM(TFOPTPreTrainedModel, TFCausalLanguageModelingLoss):
loss=output.loss,
logits=output.logits,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
......@@ -41,7 +41,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
......@@ -330,6 +329,23 @@ class TFPegasusAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Pegasus
class TFPegasusEncoderLayer(tf.keras.layers.Layer):
......@@ -346,6 +362,7 @@ class TFPegasusEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -387,6 +404,26 @@ class TFPegasusEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Pegasus
class TFPegasusDecoderLayer(tf.keras.layers.Layer):
......@@ -416,6 +453,7 @@ class TFPegasusDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -497,6 +535,32 @@ class TFPegasusDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFPegasusPreTrainedModel(TFPreTrainedModel):
config_class = PegasusConfig
......@@ -747,14 +811,6 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
......@@ -812,6 +868,21 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFPegasusDecoder(tf.keras.layers.Layer):
......@@ -953,14 +1024,6 @@ class TFPegasusDecoder(tf.keras.layers.Layer):
positions = self.embed_positions(input_shape, position_ids=position_ids)
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
......@@ -1047,6 +1110,21 @@ class TFPegasusDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFPegasusMainLayer(tf.keras.layers.Layer):
......@@ -1158,6 +1236,22 @@ class TFPegasusMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare PEGASUS Model outputting raw hidden-states without any specific head on top.",
......@@ -1245,6 +1339,14 @@ class TFPegasusModel(TFPegasusPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer):
......@@ -1452,3 +1554,14 @@ class TFPegasusForConditionalGeneration(TFPegasusPreTrainedModel, TFCausalLangua
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
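The comment in TFPegasusMainLayer.build() above relies on a tf.name_scope detail: a scope name ending in "/" is treated as absolute, so the shared embeddings are registered under the load_weight_prefix at the root rather than nested under whatever scope the caller happens to be in. An illustrative sketch (the prefix value here is made up):

```python
import tensorflow as tf

prefix = "pegasus"   # stands in for self.shared.load_weight_prefix (illustrative)
shared = tf.keras.layers.Embedding(10, 4, name="shared")
with tf.name_scope("tf_pegasus_model"):                    # some deeply nested scope
    with tf.name_scope(prefix + "/" + shared.name + "/"):  # trailing "/" -> absolute scope
        shared.build(None)

# expected to start with "pegasus/shared/", not "tf_pegasus_model/pegasus/shared/"
print(shared.weights[0].name)
```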
......@@ -1292,6 +1292,14 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
return loss
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rag", None) is not None:
with tf.name_scope(self.rag.name):
self.rag.build(None)
@add_start_docstrings_to_model_forward(
"""
......@@ -1743,3 +1751,11 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
output = tf.convert_to_tensor(output)
return tf.cast(output, tensors[0][0][0].dtype)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rag", None) is not None:
with tf.name_scope(self.rag.name):
self.rag.build(None)
......@@ -53,6 +53,7 @@ TF_REGNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFRegNetConvLayer(tf.keras.layers.Layer):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int = 3,
stride: int = 1,
......@@ -75,6 +76,8 @@ class TFRegNetConvLayer(tf.keras.layers.Layer):
)
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.activation = ACT2FN[activation] if activation is not None else tf.identity
self.in_channels = in_channels
self.out_channels = out_channels
def call(self, hidden_state):
hidden_state = self.convolution(self.padding(hidden_state))
......@@ -82,6 +85,17 @@ class TFRegNetConvLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
class TFRegNetEmbeddings(tf.keras.layers.Layer):
"""
......@@ -92,6 +106,7 @@ class TFRegNetEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.num_channels = config.num_channels
self.embedder = TFRegNetConvLayer(
in_channels=config.num_channels,
out_channels=config.embedding_size,
kernel_size=3,
stride=2,
......@@ -113,6 +128,14 @@ class TFRegNetEmbeddings(tf.keras.layers.Layer):
hidden_state = self.embedder(pixel_values)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
class TFRegNetShortCut(tf.keras.layers.Layer):
"""
......@@ -120,16 +143,29 @@ class TFRegNetShortCut(tf.keras.layers.Layer):
downsample the input using `stride=2`.
"""
def __init__(self, out_channels: int, stride: int = 2, **kwargs):
def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwargs):
super().__init__(**kwargs)
self.convolution = tf.keras.layers.Conv2D(
filters=out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution"
)
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.in_channels = in_channels
self.out_channels = out_channels
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
return self.normalization(self.convolution(inputs), training=training)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
class TFRegNetSELayer(tf.keras.layers.Layer):
"""
......@@ -143,6 +179,8 @@ class TFRegNetSELayer(tf.keras.layers.Layer):
tf.keras.layers.Conv2D(filters=reduced_channels, kernel_size=1, activation="relu", name="attention.0"),
tf.keras.layers.Conv2D(filters=in_channels, kernel_size=1, activation="sigmoid", name="attention.2"),
]
self.in_channels = in_channels
self.reduced_channels = reduced_channels
def call(self, hidden_state):
# [batch_size, h, w, num_channels] -> [batch_size, 1, 1, num_channels]
......@@ -152,6 +190,19 @@ class TFRegNetSELayer(tf.keras.layers.Layer):
hidden_state = hidden_state * pooled
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build((None, None, None, None))
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention[0].name):
self.attention[0].build([None, None, None, self.in_channels])
with tf.name_scope(self.attention[1].name):
self.attention[1].build([None, None, None, self.reduced_channels])
class TFRegNetXLayer(tf.keras.layers.Layer):
"""
......@@ -163,17 +214,17 @@ class TFRegNetXLayer(tf.keras.layers.Layer):
should_apply_shortcut = in_channels != out_channels or stride != 1
groups = max(1, out_channels // config.groups_width)
self.shortcut = (
TFRegNetShortCut(out_channels, stride=stride, name="shortcut")
TFRegNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else tf.keras.layers.Activation("linear", name="shortcut")
)
# `self.layers` instead of `self.layer` because that is a reserved argument.
self.layers = [
TFRegNetConvLayer(out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"),
TFRegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"),
TFRegNetConvLayer(
out_channels, stride=stride, groups=groups, activation=config.hidden_act, name="layer.1"
out_channels, out_channels, stride=stride, groups=groups, activation=config.hidden_act, name="layer.1"
),
TFRegNetConvLayer(out_channels, kernel_size=1, activation=None, name="layer.2"),
TFRegNetConvLayer(out_channels, out_channels, kernel_size=1, activation=None, name="layer.2"),
]
self.activation = ACT2FN[config.hidden_act]
......@@ -186,6 +237,18 @@ class TFRegNetXLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFRegNetYLayer(tf.keras.layers.Layer):
"""
......@@ -197,17 +260,17 @@ class TFRegNetYLayer(tf.keras.layers.Layer):
should_apply_shortcut = in_channels != out_channels or stride != 1
groups = max(1, out_channels // config.groups_width)
self.shortcut = (
TFRegNetShortCut(out_channels, stride=stride, name="shortcut")
TFRegNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else tf.keras.layers.Activation("linear", name="shortcut")
)
self.layers = [
TFRegNetConvLayer(out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"),
TFRegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"),
TFRegNetConvLayer(
out_channels, stride=stride, groups=groups, activation=config.hidden_act, name="layer.1"
out_channels, out_channels, stride=stride, groups=groups, activation=config.hidden_act, name="layer.1"
),
TFRegNetSELayer(out_channels, reduced_channels=int(round(in_channels / 4)), name="layer.2"),
TFRegNetConvLayer(out_channels, kernel_size=1, activation=None, name="layer.3"),
TFRegNetConvLayer(out_channels, out_channels, kernel_size=1, activation=None, name="layer.3"),
]
self.activation = ACT2FN[config.hidden_act]
......@@ -220,6 +283,18 @@ class TFRegNetYLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFRegNetStage(tf.keras.layers.Layer):
"""
......@@ -243,6 +318,15 @@ class TFRegNetStage(tf.keras.layers.Layer):
hidden_state = layer_module(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFRegNetEncoder(tf.keras.layers.Layer):
def __init__(self, config: RegNetConfig, **kwargs):
......@@ -282,6 +366,14 @@ class TFRegNetEncoder(tf.keras.layers.Layer):
return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
for stage in self.stages:
with tf.name_scope(stage.name):
stage.build(None)
@keras_serializable
class TFRegNetMainLayer(tf.keras.layers.Layer):
......@@ -333,6 +425,20 @@ class TFRegNetMainLayer(tf.keras.layers.Layer):
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build((None, None, None, None))
class TFRegNetPreTrainedModel(TFPreTrainedModel):
"""
......@@ -418,6 +524,14 @@ class TFRegNetModel(TFRegNetPreTrainedModel):
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "regnet", None) is not None:
with tf.name_scope(self.regnet.name):
self.regnet.build(None)
@add_start_docstrings(
"""
......@@ -479,3 +593,14 @@ class TFRegNetForImageClassification(TFRegNetPreTrainedModel, TFSequenceClassifi
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "regnet", None) is not None:
with tf.name_scope(self.regnet.name):
self.regnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier[1].name):
self.classifier[1].build([None, None, None, self.config.hidden_sizes[-1]])
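The RegNet layers are channels-last, so their build shapes are rank-4 with only the channel axis filled in, and the convolution layers now take in_channels in __init__ because a Conv2D kernel depends on the input channel count. A quick standalone illustration:

```python
import tensorflow as tf

conv = tf.keras.layers.Conv2D(filters=8, kernel_size=3)
conv.build([None, None, None, 4])   # NHWC input with 4 channels
print(conv.kernel.shape)            # (3, 3, 4, 8)

bn = tf.keras.layers.BatchNormalization()
bn.build([None, None, None, 8])
print(bn.gamma.shape)               # (8,)
```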
......@@ -80,7 +80,7 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -102,7 +102,12 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.input_embedding_size])
def call(
self,
......@@ -172,6 +177,7 @@ class TFRemBertSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -261,6 +267,20 @@ class TFRemBertSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RemBert
class TFRemBertSelfOutput(tf.keras.layers.Layer):
......@@ -272,6 +292,7 @@ class TFRemBertSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -280,6 +301,17 @@ class TFRemBertSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->RemBert
class TFRemBertAttention(tf.keras.layers.Layer):
......@@ -321,6 +353,17 @@ class TFRemBertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RemBert
class TFRemBertIntermediate(tf.keras.layers.Layer):
......@@ -335,6 +378,7 @@ class TFRemBertIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -342,6 +386,14 @@ class TFRemBertIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RemBert
class TFRemBertOutput(tf.keras.layers.Layer):
......@@ -353,6 +405,7 @@ class TFRemBertOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -361,6 +414,17 @@ class TFRemBertOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RemBert
class TFRemBertLayer(tf.keras.layers.Layer):
......@@ -448,6 +512,23 @@ class TFRemBertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
class TFRemBertEncoder(tf.keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
......@@ -524,6 +605,18 @@ class TFRemBertEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding_hidden_mapping_in", None) is not None:
with tf.name_scope(self.embedding_hidden_mapping_in.name):
self.embedding_hidden_mapping_in.build([None, None, self.config.input_embedding_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RemBert
class TFRemBertPooler(tf.keras.layers.Layer):
......@@ -536,6 +629,7 @@ class TFRemBertPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -545,6 +639,14 @@ class TFRemBertPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
......@@ -562,7 +664,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
self.activation = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.decoder = self.add_weight(
name="decoder/weight",
shape=[self.config.vocab_size, self.output_embedding_size],
......@@ -572,7 +674,15 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, self.config.output_embedding_size])
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self
......@@ -612,6 +722,14 @@ class TFRemBertMLMHead(tf.keras.layers.Layer):
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@keras_serializable
class TFRemBertMainLayer(tf.keras.layers.Layer):
......@@ -800,6 +918,20 @@ class TFRemBertMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFRemBertPreTrainedModel(TFPreTrainedModel):
"""
......@@ -982,6 +1114,14 @@ class TFRemBertModel(TFRemBertPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
@add_start_docstrings("""RemBERT Model with a `language modeling` head on top.""", REMBERT_START_DOCSTRING)
class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLoss):
......@@ -1054,6 +1194,17 @@ class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLos
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING
......@@ -1170,6 +1321,17 @@ class TFRemBertForCausalLM(TFRemBertPreTrainedModel, TFCausalLanguageModelingLos
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""
......@@ -1190,6 +1352,7 @@ class TFRemBertForSequenceClassification(TFRemBertPreTrainedModel, TFSequenceCla
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1246,6 +1409,17 @@ class TFRemBertForSequenceClassification(TFRemBertPreTrainedModel, TFSequenceCla
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1263,6 +1437,7 @@ class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss)
self.classifier = tf.keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -1342,6 +1517,17 @@ class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss)
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1361,6 +1547,7 @@ class TFRemBertForTokenClassification(TFRemBertPreTrainedModel, TFTokenClassific
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1415,6 +1602,17 @@ class TFRemBertForTokenClassification(TFRemBertPreTrainedModel, TFTokenClassific
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1433,6 +1631,7 @@ class TFRemBertForQuestionAnswering(TFRemBertPreTrainedModel, TFQuestionAnswerin
self.qa_outputs = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1501,3 +1700,14 @@ class TFRemBertForQuestionAnswering(TFRemBertPreTrainedModel, TFQuestionAnswerin
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -51,7 +51,13 @@ TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFResNetConvLayer(tf.keras.layers.Layer):
def __init__(
self, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu", **kwargs
self,
in_channels: int,
out_channels: int,
kernel_size: int = 3,
stride: int = 1,
activation: str = "relu",
**kwargs,
) -> None:
super().__init__(**kwargs)
self.pad_value = kernel_size // 2
......@@ -61,6 +67,8 @@ class TFResNetConvLayer(tf.keras.layers.Layer):
# Use same default momentum and epsilon as PyTorch equivalent
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.activation = ACT2FN[activation] if activation is not None else tf.keras.layers.Activation("linear")
self.in_channels = in_channels
self.out_channels = out_channels
def convolution(self, hidden_state: tf.Tensor) -> tf.Tensor:
# Pad to match that done in the PyTorch Conv2D model
......@@ -75,6 +83,17 @@ class TFResNetConvLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
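For convolutional blocks the explicit shape is NHWC, which is why in_channels is now threaded through the constructor: Conv2D needs the channel dimension to size its kernel, and BatchNormalization needs the output width for gamma/beta. A small standalone illustration with arbitrary sizes (not part of this diff):

import tensorflow as tf

conv = tf.keras.layers.Conv2D(64, kernel_size=3, name="convolution")
conv.build([None, None, None, 3])  # NHWC: only the channel dim has to be known
print(conv.kernel.shape)  # (3, 3, 3, 64) -- the kernel depends on in_channels

norm = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
norm.build([None, None, None, 64])  # gamma/beta are sized by out_channels
print(norm.gamma.shape)  # (64,)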
class TFResNetEmbeddings(tf.keras.layers.Layer):
"""
......@@ -84,6 +103,7 @@ class TFResNetEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: ResNetConfig, **kwargs) -> None:
super().__init__(**kwargs)
self.embedder = TFResNetConvLayer(
config.num_channels,
config.embedding_size,
kernel_size=7,
stride=2,
......@@ -105,6 +125,17 @@ class TFResNetEmbeddings(tf.keras.layers.Layer):
hidden_state = self.pooler(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFResNetShortCut(tf.keras.layers.Layer):
"""
......@@ -112,13 +143,15 @@ class TFResNetShortCut(tf.keras.layers.Layer):
downsample the input using `stride=2`.
"""
def __init__(self, out_channels: int, stride: int = 2, **kwargs) -> None:
def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwargs) -> None:
super().__init__(**kwargs)
self.convolution = tf.keras.layers.Conv2D(
out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution"
)
# Use same default momentum and epsilon as PyTorch equivalent
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.in_channels = in_channels
self.out_channels = out_channels
def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = x
......@@ -126,6 +159,17 @@ class TFResNetShortCut(tf.keras.layers.Layer):
hidden_state = self.normalization(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
class TFResNetBasicLayer(tf.keras.layers.Layer):
"""
......@@ -137,10 +181,10 @@ class TFResNetBasicLayer(tf.keras.layers.Layer):
) -> None:
super().__init__(**kwargs)
should_apply_shortcut = in_channels != out_channels or stride != 1
self.conv1 = TFResNetConvLayer(out_channels, stride=stride, name="layer.0")
self.conv2 = TFResNetConvLayer(out_channels, activation=None, name="layer.1")
self.conv1 = TFResNetConvLayer(in_channels, out_channels, stride=stride, name="layer.0")
self.conv2 = TFResNetConvLayer(out_channels, out_channels, activation=None, name="layer.1")
self.shortcut = (
TFResNetShortCut(out_channels, stride=stride, name="shortcut")
TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else tf.keras.layers.Activation("linear", name="shortcut")
)
......@@ -155,6 +199,20 @@ class TFResNetBasicLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build(None)
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build(None)
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
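Because shortcut can be a plain Activation("linear") rather than a TFResNetShortCut, it is built with None; for a parameterless layer this is a harmless no-op, as the quick check below suggests (illustrative snippet, not from the diff):

import tensorflow as tf

shortcut = tf.keras.layers.Activation("linear", name="shortcut")
shortcut.build(None)  # base Layer.build just marks the layer as built
print(shortcut.weights)  # [] -- nothing to create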
class TFResNetBottleNeckLayer(tf.keras.layers.Layer):
"""
......@@ -176,11 +234,11 @@ class TFResNetBottleNeckLayer(tf.keras.layers.Layer):
super().__init__(**kwargs)
should_apply_shortcut = in_channels != out_channels or stride != 1
reduces_channels = out_channels // reduction
self.conv0 = TFResNetConvLayer(reduces_channels, kernel_size=1, name="layer.0")
self.conv1 = TFResNetConvLayer(reduces_channels, stride=stride, name="layer.1")
self.conv2 = TFResNetConvLayer(out_channels, kernel_size=1, activation=None, name="layer.2")
self.conv0 = TFResNetConvLayer(in_channels, reduces_channels, kernel_size=1, name="layer.0")
self.conv1 = TFResNetConvLayer(reduces_channels, reduces_channels, stride=stride, name="layer.1")
self.conv2 = TFResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None, name="layer.2")
self.shortcut = (
TFResNetShortCut(out_channels, stride=stride, name="shortcut")
TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else tf.keras.layers.Activation("linear", name="shortcut")
)
......@@ -196,6 +254,23 @@ class TFResNetBottleNeckLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv0", None) is not None:
with tf.name_scope(self.conv0.name):
self.conv0.build(None)
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build(None)
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build(None)
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
class TFResNetStage(tf.keras.layers.Layer):
"""
......@@ -221,6 +296,15 @@ class TFResNetStage(tf.keras.layers.Layer):
hidden_state = layer(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stage_layers", None) is not None:
for layer in self.stage_layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFResNetEncoder(tf.keras.layers.Layer):
def __init__(self, config: ResNetConfig, **kwargs) -> None:
......@@ -264,6 +348,15 @@ class TFResNetEncoder(tf.keras.layers.Layer):
return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stages", None) is not None:
for layer in self.stages:
with tf.name_scope(layer.name):
layer.build(None)
class TFResNetPreTrainedModel(TFPreTrainedModel):
"""
......@@ -364,6 +457,17 @@ class TFResNetMainLayer(tf.keras.layers.Layer):
hidden_states=hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
@add_start_docstrings(
"The bare ResNet model outputting raw features without any specific head on top.",
......@@ -403,6 +507,14 @@ class TFResNetModel(TFResNetPreTrainedModel):
)
return resnet_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "resnet", None) is not None:
with tf.name_scope(self.resnet.name):
self.resnet.build(None)
@add_start_docstrings(
"""
......@@ -422,6 +534,7 @@ class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassifi
if config.num_labels > 0
else tf.keras.layers.Activation("linear", name="classifier.1")
)
self.config = config
def classifier(self, x: tf.Tensor) -> tf.Tensor:
x = tf.keras.layers.Flatten()(x)
......@@ -466,3 +579,14 @@ class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassifi
return (loss,) + output if loss is not None else output
return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "resnet", None) is not None:
with tf.name_scope(self.resnet.name):
self.resnet.build(None)
if getattr(self, "classifier_layer", None) is not None:
with tf.name_scope(self.classifier_layer.name):
self.classifier_layer.build([None, None, self.config.hidden_sizes[-1]])
......@@ -89,7 +89,7 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -111,7 +111,12 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
......@@ -184,6 +189,7 @@ class TFRobertaPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -193,6 +199,14 @@ class TFRobertaPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta
class TFRobertaSelfAttention(tf.keras.layers.Layer):
......@@ -222,6 +236,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -311,6 +326,20 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta
class TFRobertaSelfOutput(tf.keras.layers.Layer):
......@@ -322,6 +351,7 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -330,6 +360,17 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta
class TFRobertaAttention(tf.keras.layers.Layer):
......@@ -371,6 +412,17 @@ class TFRobertaAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta
class TFRobertaIntermediate(tf.keras.layers.Layer):
......@@ -385,6 +437,7 @@ class TFRobertaIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -392,6 +445,14 @@ class TFRobertaIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta
class TFRobertaOutput(tf.keras.layers.Layer):
......@@ -403,6 +464,7 @@ class TFRobertaOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -411,6 +473,17 @@ class TFRobertaOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta
class TFRobertaLayer(tf.keras.layers.Layer):
......@@ -498,6 +571,23 @@ class TFRobertaLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta
class TFRobertaEncoder(tf.keras.layers.Layer):
......@@ -568,6 +658,15 @@ class TFRobertaEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFRobertaMainLayer(tf.keras.layers.Layer):
......@@ -765,6 +864,20 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
class TFRobertaPreTrainedModel(TFPreTrainedModel):
"""
......@@ -946,6 +1059,14 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
class TFRobertaLMHead(tf.keras.layers.Layer):
"""Roberta Head for masked language modeling."""
......@@ -965,10 +1086,18 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
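Heads that own variables via add_weight() keep creating them at the top of build() and only guard the sub-layer builds, mirroring the diff above. A hypothetical TinyLMHead sketch of that shape (identifiers are illustrative, not from this PR):

import tensorflow as tf

class TinyLMHead(tf.keras.layers.Layer):
    def __init__(self, hidden_size: int, vocab_size: int, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")

    def build(self, input_shape=None):
        # own weight, created directly on this layer
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.hidden_size])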
def get_output_embeddings(self):
return self.decoder
......@@ -1076,6 +1205,17 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
......@@ -1198,6 +1338,17 @@ class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLos
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFRobertaClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
......@@ -1217,6 +1368,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -1226,6 +1378,17 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1302,6 +1465,17 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1323,6 +1497,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -1392,6 +1567,17 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1417,6 +1603,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1475,6 +1662,17 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1495,6 +1693,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1566,3 +1765,14 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -94,7 +94,7 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -116,7 +116,12 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
......@@ -189,6 +194,7 @@ class TFRobertaPreLayerNormPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -198,6 +204,14 @@ class TFRobertaPreLayerNormPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->RobertaPreLayerNorm
class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer):
......@@ -227,6 +241,7 @@ class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -316,6 +331,20 @@ class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
......@@ -325,6 +354,7 @@ class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -333,6 +363,14 @@ class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
......@@ -341,6 +379,7 @@ class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer):
self.self_attention = TFRobertaPreLayerNormSelfAttention(config, name="self")
self.dense_output = TFRobertaPreLayerNormSelfOutput(config, name="output")
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention.prune_heads
def prune_heads(self, heads):
......@@ -376,6 +415,20 @@ class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
......@@ -390,6 +443,7 @@ class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.LayerNorm(inputs=hidden_states)
......@@ -398,6 +452,17 @@ class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
......@@ -407,6 +472,7 @@ class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -415,6 +481,14 @@ class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RobertaPreLayerNorm
class TFRobertaPreLayerNormLayer(tf.keras.layers.Layer):
......@@ -502,6 +576,23 @@ class TFRobertaPreLayerNormLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->RobertaPreLayerNorm
class TFRobertaPreLayerNormEncoder(tf.keras.layers.Layer):
......@@ -572,6 +663,15 @@ class TFRobertaPreLayerNormEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFRobertaPreLayerNormMainLayer(tf.keras.layers.Layer):
......@@ -765,6 +865,23 @@ class TFRobertaPreLayerNormMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
class TFRobertaPreLayerNormPreTrainedModel(TFPreTrainedModel):
......@@ -948,6 +1065,14 @@ class TFRobertaPreLayerNormModel(TFRobertaPreLayerNormPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->RobertaPreLayerNorm
class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer):
......@@ -968,10 +1093,18 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
......@@ -1085,6 +1218,17 @@ class TFRobertaPreLayerNormForMaskedLM(TFRobertaPreLayerNormPreTrainedModel, TFM
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFCausalLanguageModelingLoss):
......@@ -1214,6 +1358,17 @@ class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFC
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->RobertaPreLayerNorm
class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer):
......@@ -1234,6 +1389,7 @@ class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -1243,6 +1399,17 @@ class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1322,6 +1489,17 @@ class TFRobertaPreLayerNormForSequenceClassification(
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1344,6 +1522,7 @@ class TFRobertaPreLayerNormForMultipleChoice(TFRobertaPreLayerNormPreTrainedMode
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
......@@ -1415,6 +1594,17 @@ class TFRobertaPreLayerNormForMultipleChoice(TFRobertaPreLayerNormPreTrainedMode
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1442,6 +1632,7 @@ class TFRobertaPreLayerNormForTokenClassification(TFRobertaPreLayerNormPreTraine
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1499,6 +1690,17 @@ class TFRobertaPreLayerNormForTokenClassification(TFRobertaPreLayerNormPreTraine
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1521,6 +1723,7 @@ class TFRobertaPreLayerNormForQuestionAnswering(TFRobertaPreLayerNormPreTrainedM
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1591,3 +1794,14 @@ class TFRobertaPreLayerNormForQuestionAnswering(TFRobertaPreLayerNormPreTrainedM
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -142,7 +142,7 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -157,7 +157,12 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
def call(
self,
......@@ -218,6 +223,7 @@ class TFRoFormerSelfAttention(tf.keras.layers.Layer):
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.rotary_value = config.rotary_value
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -307,6 +313,20 @@ class TFRoFormerSelfAttention(tf.keras.layers.Layer):
return query_layer, key_layer, value_layer
return query_layer, key_layer
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RoFormer
class TFRoFormerSelfOutput(tf.keras.layers.Layer):
......@@ -318,6 +338,7 @@ class TFRoFormerSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -326,6 +347,17 @@ class TFRoFormerSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRoFormerAttention(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
......@@ -361,6 +393,17 @@ class TFRoFormerAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RoFormer
class TFRoFormerIntermediate(tf.keras.layers.Layer):
......@@ -375,6 +418,7 @@ class TFRoFormerIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -382,6 +426,14 @@ class TFRoFormerIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RoFormer
class TFRoFormerOutput(tf.keras.layers.Layer):
......@@ -393,6 +445,7 @@ class TFRoFormerOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -401,6 +454,17 @@ class TFRoFormerOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRoFormerLayer(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
......@@ -436,6 +500,20 @@ class TFRoFormerLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "roformer_output", None) is not None:
with tf.name_scope(self.roformer_output.name):
self.roformer_output.build(None)
class TFRoFormerEncoder(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
......@@ -491,6 +569,18 @@ class TFRoFormerEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
......@@ -508,6 +598,7 @@ class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -516,6 +607,17 @@ class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
class TFRoFormerLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
......@@ -530,10 +632,15 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.input_embeddings
......@@ -572,6 +679,14 @@ class TFRoFormerMLMHead(tf.keras.layers.Layer):
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@keras_serializable
class TFRoFormerMainLayer(tf.keras.layers.Layer):
......@@ -687,6 +802,20 @@ class TFRoFormerMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "embeddings_project", None) is not None:
with tf.name_scope(self.embeddings_project.name):
self.embeddings_project.build([None, None, self.config.embedding_size])
class TFRoFormerPreTrainedModel(TFPreTrainedModel):
"""
......@@ -834,6 +963,14 @@ class TFRoFormerModel(TFRoFormerPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
@add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING)
class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingLoss):
......@@ -904,6 +1041,17 @@ class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING
......@@ -977,6 +1125,17 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
class TFRoFormerClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
......@@ -996,6 +1155,7 @@ class TFRoFormerClassificationHead(tf.keras.layers.Layer):
self.classifier_act_fn = get_tf_activation(config.hidden_act)
else:
self.classifier_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = hidden_states[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -1007,6 +1167,17 @@ class TFRoFormerClassificationHead(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1075,6 +1246,17 @@ class TFRoFormerForSequenceClassification(TFRoFormerPreTrainedModel, TFSequenceC
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1092,6 +1274,7 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos
self.classifier = tf.keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
......@@ -1167,6 +1350,20 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1186,6 +1383,7 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1238,6 +1436,17 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1256,6 +1465,7 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
self.qa_outputs = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1321,3 +1531,14 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -150,6 +150,14 @@ class TFSamPatchEmbeddings(tf.keras.layers.Layer):
embeddings = self.projection(tf.transpose(pixel_values, perm=[0, 2, 3, 1]))
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
class TFSamMLPBlock(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -157,6 +165,7 @@ class TFSamMLPBlock(tf.keras.layers.Layer):
self.lin1 = tf.keras.layers.Dense(config.mlp_dim, name="lin1")
self.lin2 = tf.keras.layers.Dense(config.hidden_size, name="lin2")
self.act = ACT2FN[config.hidden_act]
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.lin1(hidden_states)
......@@ -164,6 +173,17 @@ class TFSamMLPBlock(tf.keras.layers.Layer):
hidden_states = self.lin2(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.config.hidden_size])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.config.mlp_dim])
class TFSamLayerNorm(tf.keras.layers.Layer):
r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
......@@ -257,6 +277,23 @@ class TFSamAttention(tf.keras.layers.Layer):
return out
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.hidden_size])
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.hidden_size])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.internal_dim])
class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer):
def __init__(self, config, attention_downsample_rate: int = 2, skip_first_layer_pe: bool = False, **kwargs):
......@@ -345,6 +382,35 @@ class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, None, self.hidden_size])
if getattr(self, "cross_attn_token_to_image", None) is not None:
with tf.name_scope(self.cross_attn_token_to_image.name):
self.cross_attn_token_to_image.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, None, self.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "layer_norm3", None) is not None:
with tf.name_scope(self.layer_norm3.name):
self.layer_norm3.build([None, None, None, self.hidden_size])
if getattr(self, "layer_norm4", None) is not None:
with tf.name_scope(self.layer_norm4.name):
self.layer_norm4.build([None, None, None, self.hidden_size])
if getattr(self, "cross_attn_image_to_token", None) is not None:
with tf.name_scope(self.cross_attn_image_to_token.name):
self.cross_attn_image_to_token.build(None)
class TFSamTwoWayTransformer(tf.keras.layers.Layer):
def __init__(self, config: SamMaskDecoderConfig, **kwargs):
......@@ -412,6 +478,20 @@ class TFSamTwoWayTransformer(tf.keras.layers.Layer):
queries = self.layer_norm_final_attn(queries)
return queries, keys, all_attentions
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "final_attn_token_to_image", None) is not None:
with tf.name_scope(self.final_attn_token_to_image.name):
self.final_attn_token_to_image.build(None)
if getattr(self, "layer_norm_final_attn", None) is not None:
with tf.name_scope(self.layer_norm_final_attn.name):
self.layer_norm_final_attn.build([None, None, None, self.config.hidden_size])
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFSamFeedForward(tf.keras.layers.Layer):
def __init__(
......@@ -427,6 +507,8 @@ class TFSamFeedForward(tf.keras.layers.Layer):
for i in range(num_layers - 2)
]
self.sigmoid_output = sigmoid_output
self.hidden_dim = hidden_dim
self.input_dim = input_dim
def call(self, hidden_states):
hidden_states = self.proj_in(hidden_states)
......@@ -439,6 +521,21 @@ class TFSamFeedForward(tf.keras.layers.Layer):
hidden_states = tf.sigmoid(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj_in", None) is not None:
with tf.name_scope(self.proj_in.name):
self.proj_in.build([None, None, self.input_dim])
if getattr(self, "proj_out", None) is not None:
with tf.name_scope(self.proj_out.name):
self.proj_out.build([None, None, self.hidden_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build([None, None, self.hidden_dim])
class TFSamMaskDecoder(tf.keras.layers.Layer):
def __init__(self, config: SamMaskDecoderConfig, **kwargs):
......@@ -483,12 +580,30 @@ class TFSamMaskDecoder(tf.keras.layers.Layer):
name="iou_prediction_head",
)
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
self.iou_token = self.add_weight(shape=(1, self.hidden_size), name="iou_token.weight", trainable=True)
self.mask_tokens = self.add_weight(
shape=(self.num_mask_tokens, self.hidden_size), name="mask_tokens.weight", trainable=True
)
super().build(input_shape)
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "upscale_conv1", None) is not None:
with tf.name_scope(self.upscale_conv1.name):
self.upscale_conv1.build([None, self.hidden_size, None, None])
if getattr(self, "upscale_conv2", None) is not None:
with tf.name_scope(self.upscale_conv2.name):
self.upscale_conv2.build([None, self.hidden_size // 4, None, None])
if getattr(self, "upscale_layer_norm", None) is not None:
with tf.name_scope(self.upscale_layer_norm.name):
self.upscale_layer_norm.build(None)
if getattr(self, "iou_prediction_head", None) is not None:
with tf.name_scope(self.iou_prediction_head.name):
self.iou_prediction_head.build(None)
def call(
self,
......@@ -615,6 +730,7 @@ class TFSamMaskEmbedding(tf.keras.layers.Layer):
self.conv3 = tf.keras.layers.Conv2D(config.hidden_size, kernel_size=1, name="conv3")
self.layer_norm1 = TFSamLayerNorm(self.mask_input_channels, config.layer_norm_eps, name="layer_norm1")
self.layer_norm2 = TFSamLayerNorm(self.mask_input_channels * 4, config.layer_norm_eps, name="layer_norm2")
self.config = config
def call(self, masks):
masks = tf.transpose(masks, perm=(0, 2, 3, 1)) # Convert to channels-last
......@@ -629,24 +745,21 @@ class TFSamMaskEmbedding(tf.keras.layers.Layer):
dense_embeddings = tf.transpose(dense_embeddings, perm=(0, 3, 1, 2)) # Convert back to channels-first
return dense_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
# This class needs an explicit build method because it isn't called with the standard dummy inputs
conv1_shape = [None, None, None, 1]
conv2_shape = [None, None, None, self.mask_input_channels]
conv3_shape = [None, None, None, self.mask_input_channels * 4]
layer_norm1_shape = [None, None, None, self.mask_input_channels]
layer_norm2_shape = [None, None, None, self.mask_input_channels * 4]
if self.built:
return
self.built = True
with tf.name_scope("conv1"):
self.conv1.build(conv1_shape)
self.conv1.build([None, None, None, 1])
with tf.name_scope("conv2"):
self.conv2.build(conv2_shape)
self.conv2.build([None, None, None, self.mask_input_channels])
with tf.name_scope("conv3"):
self.conv3.build(conv3_shape)
self.conv3.build([None, None, None, self.mask_input_channels * 4])
with tf.name_scope("layer_norm1"):
self.layer_norm1.build(layer_norm1_shape)
self.layer_norm1.build([None, None, None, self.mask_input_channels])
with tf.name_scope("layer_norm2"):
self.layer_norm2.build(layer_norm2_shape)
super().build(input_shape)
self.layer_norm2.build([None, None, None, self.mask_input_channels * 4])
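As the comment at the top of this build() notes, the mask-embedding branch is optional, so the standard dummy inputs never reach these sublayers and shape inference at call time cannot be relied on; the channel counts therefore have to be spelled out from the config. A rough sketch of that situation with invented names (TFOptionalBranch, channels):

import tensorflow as tf

class TFOptionalBranch(tf.keras.layers.Layer):
    def __init__(self, channels, **kwargs):
        super().__init__(**kwargs)
        self.conv = tf.keras.layers.Conv2D(channels, kernel_size=1, name="conv")
        self.channels = channels

    def call(self, masks=None):
        if masks is None:  # the path the dummy inputs take: conv is never exercised
            return None
        return self.conv(masks)

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # build explicitly with the channel count known up front, since call()
        # may never reach the conv and so would never trigger shape inference
        with tf.name_scope("conv"):
            self.conv.build([None, None, None, 1])

branch = TFOptionalBranch(channels=8, name="branch")
branch.build(None)               # creates the conv weights without calling the layer
print(len(branch.conv.weights))  # 2 (kernel and bias)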
class TFSamPromptEncoder(tf.keras.layers.Layer):
......@@ -664,7 +777,7 @@ class TFSamPromptEncoder(tf.keras.layers.Layer):
self.not_a_point_embed = None
self.config = config
def build(self, input_shape):
def build(self, input_shape=None):
self.no_mask_embed = self.add_weight(
name="no_mask_embed.weight",
shape=(1, self.hidden_size),
......@@ -691,7 +804,13 @@ class TFSamPromptEncoder(tf.keras.layers.Layer):
self.mask_embed.build(
(None, self.config.mask_input_channels, self.config.image_size, self.config.image_size)
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "mask_embed", None) is not None:
with tf.name_scope(self.mask_embed.name):
self.mask_embed.build(None)
def _embed_points(self, points: tf.Tensor, labels: tf.Tensor, pad: bool) -> tf.Tensor:
"""Embeds point prompts."""
......@@ -812,7 +931,7 @@ class TFSamVisionAttention(tf.keras.layers.Layer):
raise ValueError("Input size must be provided if using relative positional encoding.")
self.config = config
def build(self, input_shape):
def build(self, input_shape=None):
if self.input_size is not None:
# initialize relative positional embeddings
self.rel_pos_h = self.add_weight(
......@@ -821,7 +940,16 @@ class TFSamVisionAttention(tf.keras.layers.Layer):
self.rel_pos_w = self.add_weight(
shape=(2 * self.input_size[1] - 1, self.head_dim), initializer="zeros", name="rel_pos_w"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "qkv", None) is not None:
with tf.name_scope(self.qkv.name):
self.qkv.build([None, None, self.config.hidden_size])
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, self.config.hidden_size])
def get_rel_pos(self, q_size: int, k_size: int, rel_pos: tf.Tensor) -> tf.Tensor:
"""
......@@ -949,6 +1077,7 @@ class TFSamVisionLayer(tf.keras.layers.Layer):
self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
self.mlp = TFSamMLPBlock(config, name="mlp")
self.window_size = window_size
self.config = config
def window_partition(self, hidden_states: tf.Tensor, window_size: int) -> Tuple[tf.Tensor, Tuple[int, int]]:
batch_size, height, width, channel = shape_list(hidden_states)
......@@ -1016,6 +1145,23 @@ class TFSamVisionLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, None, self.config.hidden_size])
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, None, self.config.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
class TFSamVisionNeck(tf.keras.layers.Layer):
def __init__(self, config: SamVisionConfig, **kwargs):
......@@ -1047,6 +1193,23 @@ class TFSamVisionNeck(tf.keras.layers.Layer):
hidden_states = tf.transpose(hidden_states, perm=[0, 3, 1, 2])
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build([None, None, None, self.config.hidden_size])
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build(None)
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build([None, None, None, self.config.output_channels])
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build(None)
class TFSamVisionEncoder(tf.keras.layers.Layer):
def __init__(self, config: SamVisionConfig, **kwargs):
......@@ -1069,7 +1232,10 @@ class TFSamVisionEncoder(tf.keras.layers.Layer):
self.neck = TFSamVisionNeck(config, name="neck")
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.config.use_abs_pos:
# Initialize absolute positional embedding with pretrain image size.
self.pos_embed = self.add_weight(
......@@ -1083,7 +1249,16 @@ class TFSamVisionEncoder(tf.keras.layers.Layer):
trainable=True,
name="pos_embed",
)
super().build(input_shape)
if getattr(self, "patch_embed", None) is not None:
with tf.name_scope(self.patch_embed.name):
self.patch_embed.build(None)
if getattr(self, "neck", None) is not None:
with tf.name_scope(self.neck.name):
self.neck.build(None)
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
def get_input_embeddings(self):
return self.patch_embed
......@@ -1463,3 +1638,20 @@ class TFSamModel(TFSamPreTrainedModel):
vision_attentions=attns if self.config.output_attentions else None,
mask_decoder_attentions=output.mask_decoder_attentions if self.config.output_attentions else None,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "shared_image_embedding", None) is not None:
with tf.name_scope(self.shared_image_embedding.name):
self.shared_image_embedding.build(None)
if getattr(self, "vision_encoder", None) is not None:
with tf.name_scope(self.vision_encoder.name):
self.vision_encoder.build(None)
if getattr(self, "prompt_encoder", None) is not None:
with tf.name_scope(self.prompt_encoder.name):
self.prompt_encoder.build(None)
if getattr(self, "mask_decoder", None) is not None:
with tf.name_scope(self.mask_decoder.name):
self.mask_decoder.build(None)
......@@ -79,7 +79,7 @@ class TFSegformerDropPath(tf.keras.layers.Layer):
class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer):
"""Construct the overlapping patch embeddings."""
def __init__(self, patch_size, stride, hidden_size, **kwargs):
def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs):
super().__init__(**kwargs)
self.padding = tf.keras.layers.ZeroPadding2D(padding=patch_size // 2)
self.proj = tf.keras.layers.Conv2D(
......@@ -87,6 +87,8 @@ class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer):
)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm")
self.num_channels = num_channels
self.hidden_size = hidden_size
def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]:
embeddings = self.proj(self.padding(pixel_values))
......@@ -99,6 +101,17 @@ class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer):
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, None, self.num_channels])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.hidden_size])
class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer):
"""SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
......@@ -196,18 +209,47 @@ class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer):
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.hidden_size])
if getattr(self, "sr", None) is not None:
with tf.name_scope(self.sr.name):
self.sr.build([None, None, None, self.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.hidden_size])
class TFSegformerSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.hidden_size = hidden_size
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.hidden_size])
class TFSegformerAttention(tf.keras.layers.Layer):
def __init__(
......@@ -237,6 +279,17 @@ class TFSegformerAttention(tf.keras.layers.Layer):
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFSegformerDWConv(tf.keras.layers.Layer):
def __init__(self, dim: int = 768, **kwargs):
......@@ -244,6 +297,7 @@ class TFSegformerDWConv(tf.keras.layers.Layer):
self.depthwise_convolution = tf.keras.layers.Conv2D(
filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv"
)
self.dim = dim
def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor:
batch_size = shape_list(hidden_states)[0]
......@@ -257,6 +311,14 @@ class TFSegformerDWConv(tf.keras.layers.Layer):
hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels))
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "depthwise_convolution", None) is not None:
with tf.name_scope(self.depthwise_convolution.name):
self.depthwise_convolution.build([None, None, None, self.dim])
class TFSegformerMixFFN(tf.keras.layers.Layer):
def __init__(
......@@ -277,6 +339,8 @@ class TFSegformerMixFFN(tf.keras.layers.Layer):
self.intermediate_act_fn = config.hidden_act
self.dense2 = tf.keras.layers.Dense(out_features, name="dense2")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.hidden_features = hidden_features
self.in_features = in_features
def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
hidden_states = self.dense1(hidden_states)
......@@ -287,6 +351,20 @@ class TFSegformerMixFFN(tf.keras.layers.Layer):
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense1", None) is not None:
with tf.name_scope(self.dense1.name):
self.dense1.build([None, None, self.in_features])
if getattr(self, "depthwise_convolution", None) is not None:
with tf.name_scope(self.depthwise_convolution.name):
self.depthwise_convolution.build(None)
if getattr(self, "dense2", None) is not None:
with tf.name_scope(self.dense2.name):
self.dense2.build([None, None, self.hidden_features])
class TFSegformerLayer(tf.keras.layers.Layer):
"""This corresponds to the Block class in the original implementation."""
......@@ -314,6 +392,7 @@ class TFSegformerLayer(tf.keras.layers.Layer):
self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2")
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp")
self.hidden_size = hidden_size
def call(
self,
......@@ -347,6 +426,23 @@ class TFSegformerLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm_1", None) is not None:
with tf.name_scope(self.layer_norm_1.name):
self.layer_norm_1.build([None, None, self.hidden_size])
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm_2", None) is not None:
with tf.name_scope(self.layer_norm_2.name):
self.layer_norm_2.build([None, None, self.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
class TFSegformerEncoder(tf.keras.layers.Layer):
def __init__(self, config: SegformerConfig, **kwargs):
......@@ -363,6 +459,7 @@ class TFSegformerEncoder(tf.keras.layers.Layer):
TFSegformerOverlapPatchEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
name=f"patch_embeddings.{i}",
)
......@@ -449,6 +546,24 @@ class TFSegformerEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norms", None) is not None:
for layer, shape in zip(self.layer_norms, self.config.hidden_sizes):
with tf.name_scope(layer.name):
layer.build([None, None, shape])
if getattr(self, "block", None) is not None:
for block in self.block:
for layer in block:
with tf.name_scope(layer.name):
layer.build(None)
if getattr(self, "embeddings", None) is not None:
for layer in self.embeddings:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFSegformerMainLayer(tf.keras.layers.Layer):
......@@ -509,6 +624,14 @@ class TFSegformerMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
class TFSegformerPreTrainedModel(TFPreTrainedModel):
"""
......@@ -605,6 +728,14 @@ class TFSegformerModel(TFSegformerPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
@add_start_docstrings(
"""
......@@ -622,6 +753,7 @@ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceCl
# Classifier head
self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier")
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -668,15 +800,27 @@ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceCl
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
class TFSegformerMLP(tf.keras.layers.Layer):
"""
Linear Embedding.
"""
def __init__(self, config: SegformerConfig, **kwargs):
def __init__(self, input_dim: int, config: SegformerConfig, **kwargs):
super().__init__(**kwargs)
self.proj = tf.keras.layers.Dense(config.decoder_hidden_size, name="proj")
self.input_dim = input_dim
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
height = shape_list(hidden_states)[1]
......@@ -686,6 +830,14 @@ class TFSegformerMLP(tf.keras.layers.Layer):
hidden_states = self.proj(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, self.input_dim])
class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
def __init__(self, config: SegformerConfig, **kwargs):
......@@ -693,7 +845,7 @@ class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
# linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size
mlps = []
for i in range(config.num_encoder_blocks):
mlp = TFSegformerMLP(config, name=f"linear_c.{i}")
mlp = TFSegformerMLP(config=config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}")
mlps.append(mlp)
self.mlps = mlps
......@@ -741,6 +893,26 @@ class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
return logits
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "linear_fuse", None) is not None:
with tf.name_scope(self.linear_fuse.name):
self.linear_fuse.build(
[None, None, None, self.config.decoder_hidden_size * self.config.num_encoder_blocks]
)
if getattr(self, "batch_norm", None) is not None:
with tf.name_scope(self.batch_norm.name):
self.batch_norm.build([None, None, None, self.config.decoder_hidden_size])
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, None, self.config.decoder_hidden_size])
if getattr(self, "mlps", None) is not None:
for layer in self.mlps:
with tf.name_scope(layer.name):
layer.build(None)
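The decoder_hidden_size * num_encoder_blocks channel count used for linear_fuse reflects the decode head concatenating one feature map per encoder block, each already projected to decoder_hidden_size channels, along the channel axis. The arithmetic with hypothetical config values:

import tensorflow as tf

decoder_hidden_size, num_encoder_blocks = 256, 4  # hypothetical values
feature_maps = [tf.random.normal((1, 32, 32, decoder_hidden_size)) for _ in range(num_encoder_blocks)]
fused_input = tf.concat(feature_maps, axis=-1)    # channel-wise concatenation
print(fused_input.shape[-1])                      # 1024 == decoder_hidden_size * num_encoder_blocks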
@add_start_docstrings(
"""SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""",
......@@ -851,3 +1023,14 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
if getattr(self, "decode_head", None) is not None:
with tf.name_scope(self.decode_head.name):
self.decode_head.build(None)
......@@ -166,6 +166,15 @@ class TFConv1dSubsampler(tf.keras.layers.Layer):
hidden_states = glu(hidden_states, axis=2) # GLU over the Channel dimension
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv_layers", None) is not None:
for i, layer in enumerate(self.conv_layers):
with tf.name_scope(layer.name):
layer.build([None, None, self.in_channels] if i == 0 else [None, None, self.mid_channels // 2])
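The mid_channels // 2 used above for every conv after the first follows from the GLU in call(): the GLU splits the channel axis in half and gates one half with the other, so each subsequent conv sees half of the previous conv's output channels. A tiny self-contained check (the shapes are made up):

import tensorflow as tf

x = tf.random.normal((2, 10, 8))                  # (batch, time, channels)
a, b = tf.split(x, num_or_size_splits=2, axis=2)  # GLU: split the channel dimension
gated = a * tf.sigmoid(b)                         # gate one half with the other
print(gated.shape)                                # (2, 10, 4): channels halved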
class TFSpeech2TextSinusoidalPositionalEmbedding(tf.keras.layers.Layer):
"""This module produces sinusoidal positional embeddings of any length."""
......@@ -379,6 +388,23 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: Speech2TextConfig, **kwargs):
......@@ -394,6 +420,7 @@ class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False
......@@ -434,6 +461,26 @@ class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer):
def __init__(self, config: Speech2TextConfig, **kwargs):
......@@ -463,6 +510,7 @@ class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -546,6 +594,32 @@ class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFSpeech2TextPreTrainedModel(TFPreTrainedModel):
config_class = Speech2TextConfig
......@@ -870,6 +944,24 @@ class TFSpeech2TextEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build(None)
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFSpeech2TextDecoder(tf.keras.layers.Layer):
......@@ -1092,6 +1184,24 @@ class TFSpeech2TextDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFSpeech2TextMainLayer(tf.keras.layers.Layer):
......@@ -1197,6 +1307,17 @@ class TFSpeech2TextMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare Speech2Text Model outputting raw hidden-states without any specific head on top.",
......@@ -1279,6 +1400,14 @@ class TFSpeech2TextModel(TFSpeech2TextPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"The Speech2Text Model with a language modeling head. Can be used for summarization.",
......@@ -1291,6 +1420,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
self.lm_head = tf.keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head")
# TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate
self.supports_xla_generation = False
self.config = config
def get_encoder(self):
return self.model.encoder
......@@ -1461,6 +1591,17 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
"use_cache": use_cache, # change this to avoid caching (presumably for debugging)
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.d_model])
def tf_to_pt_weight_rename(self, tf_weight):
if tf_weight == "lm_head.weight":
return tf_weight, "model.decoder.embed_tokens.weight"
......@@ -283,6 +283,7 @@ class TFSwinEmbeddings(tf.keras.layers.Layer):
self.norm = tf.keras.layers.LayerNormalization(name="norm", epsilon=1e-5)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.config = config
def build(self, input_shape: tf.TensorShape) -> None:
if self.use_mask_token:
......@@ -296,7 +297,19 @@ class TFSwinEmbeddings(tf.keras.layers.Layer):
)
else:
self.position_embeddings = None
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
if getattr(self, "norm", None) is not None:
with tf.name_scope(self.norm.name):
self.norm.build([None, None, self.config.embed_dim])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
def call(
self, pixel_values: tf.Tensor, bool_masked_pos: bool = None, training: bool = False
......@@ -381,6 +394,14 @@ class TFSwinPatchEmbeddings(tf.keras.layers.Layer):
embeddings = tf.transpose(embeddings, (0, 2, 1))
return embeddings, output_dimensions
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
class TFSwinPatchMerging(tf.keras.layers.Layer):
"""
......@@ -443,6 +464,17 @@ class TFSwinPatchMerging(tf.keras.layers.Layer):
return input_feature
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "reduction", None) is not None:
with tf.name_scope(self.reduction.name):
self.reduction.build([None, None, 4 * self.dim])
if getattr(self, "norm", None) is not None:
with tf.name_scope(self.norm.name):
self.norm.build([None, None, 4 * self.dim])
class TFSwinDropPath(tf.keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
......@@ -521,7 +553,19 @@ class TFSwinSelfAttention(tf.keras.layers.Layer):
relative_coords = tf.stack([stack_0, stack_1], axis=2)
self.relative_position_index.assign(tf.cast(tf.reduce_sum(relative_coords, axis=-1), tf.int32))
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.all_head_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.all_head_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.all_head_size])
def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor:
new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size]
......@@ -597,12 +641,24 @@ class TFSwinSelfOutput(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(dim, name="dense")
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout")
self.dim = dim
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.dim])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
class TFSwinAttention(tf.keras.layers.Layer):
def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None:
......@@ -631,6 +687,17 @@ class TFSwinAttention(tf.keras.layers.Layer):
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
class TFSwinIntermediate(tf.keras.layers.Layer):
def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
......@@ -640,24 +707,43 @@ class TFSwinIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dim = dim
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.dim])
class TFSwinOutput(tf.keras.layers.Layer):
def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(dim, name="dense")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.config = config
self.dim = dim
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, int(self.config.mlp_ratio * self.dim)])
class TFSwinLayer(tf.keras.layers.Layer):
def __init__(
......@@ -684,6 +770,7 @@ class TFSwinLayer(tf.keras.layers.Layer):
)
self.intermediate = TFSwinIntermediate(config, dim, name="intermediate")
self.swin_output = TFSwinOutput(config, dim, name="output")
self.dim = dim
def get_attn_mask(self, height: int, width: int, window_size: int, shift_size: int) -> tf.Tensor | None:
img_mask = tf.zeros((height, width))
......@@ -789,6 +876,29 @@ class TFSwinLayer(tf.keras.layers.Layer):
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.dim])
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.dim])
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "swin_output", None) is not None:
with tf.name_scope(self.swin_output.name):
self.swin_output.build(None)
class TFSwinStage(tf.keras.layers.Layer):
def __init__(
......@@ -861,6 +971,18 @@ class TFSwinStage(tf.keras.layers.Layer):
stage_outputs += layer_outputs[1:]
return stage_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "downsample", None) is not None:
with tf.name_scope(self.downsample.name):
self.downsample.build(None)
if getattr(self, "blocks", None) is not None:
for layer in self.blocks:
with tf.name_scope(layer.name):
layer.build(None)
class TFSwinEncoder(tf.keras.layers.Layer):
def __init__(self, config: SwinConfig, grid_size: Tuple[int, int], **kwargs):
......@@ -941,6 +1063,15 @@ class TFSwinEncoder(tf.keras.layers.Layer):
reshaped_hidden_states=all_reshaped_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFSwinPreTrainedModel(TFPreTrainedModel):
"""
......@@ -1160,6 +1291,20 @@ class TFSwinMainLayer(tf.keras.layers.Layer):
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.num_features])
@add_start_docstrings(
"The bare Swin Model transformer outputting raw hidden-states without any specific head on top.",
......@@ -1217,6 +1362,14 @@ class TFSwinModel(TFSwinPreTrainedModel):
return swin_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
class TFSwinPixelShuffle(tf.keras.layers.Layer):
"""TF layer implementation of torch.nn.PixelShuffle"""
......@@ -1251,6 +1404,7 @@ class TFSwinDecoder(tf.keras.layers.Layer):
filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, strides=1, name="0"
)
self.pixel_shuffle = TFSwinPixelShuffle(config.encoder_stride, name="1")
self.config = config
def call(self, x: tf.Tensor) -> tf.Tensor:
hidden_states = x
......@@ -1262,6 +1416,17 @@ class TFSwinDecoder(tf.keras.layers.Layer):
hidden_states = tf.transpose(hidden_states, (0, 3, 1, 2))
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv2d", None) is not None:
with tf.name_scope(self.conv2d.name):
self.conv2d.build([None, None, None, self.config.hidden_size])
if getattr(self, "pixel_shuffle", None) is not None:
with tf.name_scope(self.pixel_shuffle.name):
self.pixel_shuffle.build(None)
@add_start_docstrings(
"Swin Model with a decoder on top for masked image modeling, as proposed in"
......@@ -1372,6 +1537,17 @@ class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"""
......@@ -1446,3 +1622,15 @@ class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificati
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.swin.num_features])
......@@ -45,7 +45,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
......@@ -75,16 +74,17 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFT5LayerNorm(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-6, **kwargs):
def __init__(self, hidden_size, epsilon=1e-6, **kwargs):
"""
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
"""
super().__init__(**kwargs)
self.variance_epsilon = epsilon
self.hidden_size = hidden_size
def build(self, input_shape):
"""Build shared word embedding layer"""
self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
self.weight = self.add_weight("weight", shape=(self.hidden_size,), initializer="ones")
super().build(input_shape)
def call(self, hidden_states):
......@@ -110,6 +110,7 @@ class TFT5DenseActDense(tf.keras.layers.Layer):
) # Update init weights as in flax
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
self.act = get_tf_activation(config.dense_act_fn)
self.config = config
def call(self, hidden_states, training=False):
hidden_states = self.wi(hidden_states)
......@@ -118,6 +119,17 @@ class TFT5DenseActDense(tf.keras.layers.Layer):
hidden_states = self.wo(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wi", None) is not None:
with tf.name_scope(self.wi.name):
self.wi.build([None, None, self.config.d_model])
if getattr(self, "wo", None) is not None:
with tf.name_scope(self.wo.name):
self.wo.build([None, None, self.config.d_ff])
class TFT5DenseGatedActDense(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -139,6 +151,7 @@ class TFT5DenseGatedActDense(tf.keras.layers.Layer):
) # Update init weights as in flax
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
self.act = get_tf_activation(config.dense_act_fn)
self.config = config
def call(self, hidden_states, training=False):
hidden_gelu = self.act(self.wi_0(hidden_states))
......@@ -148,6 +161,20 @@ class TFT5DenseGatedActDense(tf.keras.layers.Layer):
hidden_states = self.wo(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wi_0", None) is not None:
with tf.name_scope(self.wi_0.name):
self.wi_0.build([None, None, self.config.d_model])
if getattr(self, "wi_1", None) is not None:
with tf.name_scope(self.wi_1.name):
self.wi_1.build([None, None, self.config.d_model])
if getattr(self, "wo", None) is not None:
with tf.name_scope(self.wo.name):
self.wo.build([None, None, self.config.d_ff])
class TFT5LayerFF(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -157,7 +184,7 @@ class TFT5LayerFF(tf.keras.layers.Layer):
else:
self.DenseReluDense = TFT5DenseActDense(config, name="DenseReluDense")
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(self, hidden_states, training=False):
......@@ -166,6 +193,17 @@ class TFT5LayerFF(tf.keras.layers.Layer):
hidden_states = hidden_states + self.dropout(dense_output, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
if getattr(self, "DenseReluDense", None) is not None:
with tf.name_scope(self.DenseReluDense.name):
self.DenseReluDense.build(None)
class TFT5Attention(tf.keras.layers.Layer):
NEW_ID = itertools.count()
......@@ -218,7 +256,10 @@ class TFT5Attention(tf.keras.layers.Layer):
self.pruned_heads = set()
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.has_relative_attention_bias:
with tf.name_scope("relative_attention_bias"):
self.relative_attention_bias = self.add_weight(
......@@ -226,8 +267,18 @@ class TFT5Attention(tf.keras.layers.Layer):
shape=[self.relative_attention_num_buckets, self.n_heads],
initializer=self.relative_attention_bias_initializer, # Add initializer
)
return super().build(input_shape)
if getattr(self, "q", None) is not None:
with tf.name_scope(self.q.name):
self.q.build([None, None, self.d_model])
if getattr(self, "k", None) is not None:
with tf.name_scope(self.k.name):
self.k.build([None, None, self.d_model])
if getattr(self, "v", None) is not None:
with tf.name_scope(self.v.name):
self.v.build([None, None, self.d_model])
if getattr(self, "o", None) is not None:
with tf.name_scope(self.o.name):
self.o.build([None, None, self.inner_dim])
def prune_heads(self, heads):
raise NotImplementedError
......@@ -439,7 +490,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
has_relative_attention_bias=has_relative_attention_bias,
name="SelfAttention",
)
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(
......@@ -468,6 +519,17 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "SelfAttention", None) is not None:
with tf.name_scope(self.SelfAttention.name):
self.SelfAttention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -477,7 +539,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
has_relative_attention_bias=False,
name="EncDecAttention",
)
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(
......@@ -510,6 +572,17 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "EncDecAttention", None) is not None:
with tf.name_scope(self.EncDecAttention.name):
self.EncDecAttention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
class TFT5Block(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
......@@ -613,6 +686,15 @@ class TFT5Block(tf.keras.layers.Layer):
outputs = outputs + (present_key_value_state,) + attention_outputs
return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
def build(self, input_shape=None):
if self.built:
return
self.built = True
for layer_module in self.layer:
if hasattr(layer_module, "name"):
with tf.name_scope(layer_module.name):
layer_module.build(None)
####################################################
# The full model without a specific pretrained or finetuning head is
......@@ -640,7 +722,9 @@ class TFT5MainLayer(tf.keras.layers.Layer):
TFT5Block(config, has_relative_attention_bias=bool(i == 0), name=f"block_._{i}")
for i in range(config.num_layers)
]
self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
self.final_layer_norm = TFT5LayerNorm(
config.d_model, epsilon=config.layer_norm_epsilon, name="final_layer_norm"
)
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def _prune_heads(self, heads_to_prune):
......@@ -679,14 +763,6 @@ class TFT5MainLayer(tf.keras.layers.Layer):
if inputs_embeds is None:
assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids)
......@@ -846,6 +922,18 @@ class TFT5MainLayer(tf.keras.layers.Layer):
attentions=all_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build(None)
if getattr(self, "block", None) is not None:
for layer in self.block:
with tf.name_scope(layer.name):
layer.build(None)
####################################################
# TFT5PreTrainedModel is a sub-class of tf.keras.Model
@@ -1221,6 +1309,22 @@ class TFT5Model(TFT5PreTrainedModel):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
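The trailing "/" trick described in the comment above is standard tf.name_scope behaviour: a scope name ending in "/" is treated as absolute rather than nested under the current scope. A standalone snippet (the scope and variable names here are illustrative only) showing the difference:

import tensorflow as tf

with tf.name_scope("tf_t5model"):
    with tf.name_scope("encoder"):   # relative: nests under the current scope
        nested = tf.Variable(0.0, name="weight")
    with tf.name_scope("shared/"):   # absolute: the trailing "/" replaces the current scope
        rooted = tf.Variable(0.0, name="weight")

print(nested.name)  # expected: tf_t5model/encoder/weight:0
print(rooted.name)  # expected: shared/weight:0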
@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModelingLoss):
@@ -1250,6 +1354,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
self.lm_head = tf.keras.layers.Dense(
config.vocab_size, use_bias=False, name="lm_head", kernel_initializer=lm_head_initializer
) # Update init weights as in flax
self.config = config
def get_output_embeddings(self):
if self.config.tie_word_embeddings:
@@ -1471,6 +1576,25 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return self._shift_right(labels)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.d_model])
@add_start_docstrings(
"The bare T5 Model transformer outputting encoder's raw hidden-stateswithout any specific head on top.",
@@ -1549,3 +1673,16 @@ class TFT5EncoderModel(TFT5PreTrainedModel):
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
@@ -160,7 +160,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
@@ -186,7 +186,12 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def call(
self,
@@ -279,6 +284,7 @@ class TFTapasSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -368,6 +374,20 @@ class TFTapasSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Tapas
class TFTapasSelfOutput(tf.keras.layers.Layer):
@@ -379,6 +399,7 @@ class TFTapasSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -387,6 +408,17 @@ class TFTapasSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Tapas
class TFTapasAttention(tf.keras.layers.Layer):
@@ -428,6 +460,17 @@ class TFTapasAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Tapas
class TFTapasIntermediate(tf.keras.layers.Layer):
@@ -442,6 +485,7 @@ class TFTapasIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -449,6 +493,14 @@ class TFTapasIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Tapas
class TFTapasOutput(tf.keras.layers.Layer):
@@ -460,6 +512,7 @@ class TFTapasOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -468,6 +521,17 @@ class TFTapasOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
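The shape lists passed to these build() calls only need an accurate last dimension; for a Dense layer that single value is enough to size the kernel. A quick standalone check (the sizes here are arbitrary, not from this diff):

import tensorflow as tf

dense = tf.keras.layers.Dense(units=4, name="dense")
dense.build([None, None, 8])   # only the last dimension is used to size the kernel
print(dense.kernel.shape)      # (8, 4)
print(dense.bias.shape)        # (4,)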
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Tapas
class TFTapasLayer(tf.keras.layers.Layer):
@@ -555,6 +619,23 @@ class TFTapasLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Tapas
class TFTapasEncoder(tf.keras.layers.Layer):
@@ -625,6 +706,15 @@ class TFTapasEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Tapas
class TFTapasPooler(tf.keras.layers.Layer):
@@ -637,6 +727,7 @@ class TFTapasPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
@@ -646,6 +737,14 @@ class TFTapasPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Tapas
class TFTapasPredictionHeadTransform(tf.keras.layers.Layer):
@@ -664,6 +763,7 @@ class TFTapasPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -672,6 +772,17 @@ class TFTapasPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Tapas
class TFTapasLMPredictionHead(tf.keras.layers.Layer):
@@ -687,10 +798,15 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.input_embeddings
@@ -729,6 +845,14 @@ class TFTapasMLMHead(tf.keras.layers.Layer):
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@keras_serializable
class TFTapasMainLayer(tf.keras.layers.Layer):
@@ -852,6 +976,20 @@ class TFTapasMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFTapasPreTrainedModel(TFPreTrainedModel):
"""
@@ -1033,6 +1171,14 @@ class TFTapasModel(TFTapasPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "tapas", None) is not None:
with tf.name_scope(self.tapas.name):
self.tapas.build(None)
@add_start_docstrings("""Tapas Model with a `language modeling` head on top.""", TAPAS_START_DOCSTRING)
class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -1129,6 +1275,17 @@ class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "tapas", None) is not None:
with tf.name_scope(self.tapas.name):
self.tapas.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFTapasComputeTokenLogits(tf.keras.layers.Layer):
def __init__(self, config: TapasConfig, **kwargs):
@@ -1552,6 +1709,23 @@ class TFTapasForQuestionAnswering(TFTapasPreTrainedModel):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "tapas", None) is not None:
with tf.name_scope(self.tapas.name):
self.tapas.build(None)
if getattr(self, "compute_token_logits", None) is not None:
with tf.name_scope(self.compute_token_logits.name):
self.compute_token_logits.build(None)
if getattr(self, "compute_column_logits", None) is not None:
with tf.name_scope(self.compute_column_logits.name):
self.compute_column_logits.build(None)
if getattr(self, "aggregation_classifier", None) is not None:
with tf.name_scope(self.aggregation_classifier.name):
self.aggregation_classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1570,6 +1744,7 @@ class TFTapasForSequenceClassification(TFTapasPreTrainedModel, TFSequenceClassif
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1654,6 +1829,20 @@ class TFTapasForSequenceClassification(TFTapasPreTrainedModel, TFSequenceClassif
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "tapas", None) is not None:
with tf.name_scope(self.tapas.name):
self.tapas.build(None)
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
""" TAPAS utilities."""
@@ -684,3 +684,17 @@ class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLos
"Resizing the embedding layers via the TFVisionEncoderDecoderModel directly is not supported. "
"Please use the respective methods of the wrapped objects (model.decoder.resize_token_embeddings(...))"
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "enc_to_dec_proj", None) is not None:
with tf.name_scope(self.enc_to_dec_proj.name):
self.enc_to_dec_proj.build([None, None, self.encoder.config.hidden_size])
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@@ -220,12 +220,26 @@ class TFVisionTextDualEncoderModel(TFPreTrainedModel):
self.visual_projection = Dense(self.projection_dim, use_bias=False, name="visual_projection")
self.text_projection = Dense(self.projection_dim, use_bias=False, name="text_projection")
self.logit_scale = None
self.config = config
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build in the build() method to make sure the names are right
initializer = tf.keras.initializers.Constant(self.config.logit_scale_init_value)
self.logit_scale = self.add_weight(shape=(1,), initializer=initializer, name="logit_scale")
super().build(input_shape)
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection.name):
self.visual_projection.build([None, None, self.vision_embed_dim])
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection.name):
self.text_projection.build([None, None, self.text_embed_dim])
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
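On the "Build in the build() method" comment above: weights created during build() are created while Keras has the layer's name scope active, so they pick up the fully-qualified name, whereas weights created in __init__ do not. A rough sketch under that assumption (ScaleLayer and its weight names are made up, not part of this diff):

import tensorflow as tf

class ScaleLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Created outside any name scope: typically just "early:0".
        self.early = self.add_weight(shape=(1,), initializer="ones", name="early")

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Created while the layer's scope is active: typically "scale_layer/late:0".
        self.late = self.add_weight(shape=(1,), initializer="ones", name="late")

    def call(self, x):
        return x * self.early * self.late

layer = ScaleLayer(name="scale_layer")
_ = layer(tf.zeros((1,)))
print(layer.early.name, layer.late.name)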
def tf_to_pt_weight_rename(self, tf_weight):
# Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
@@ -66,7 +66,7 @@ class TFViTEmbeddings(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
num_patches = self.patch_embeddings.num_patches
self.cls_token = self.add_weight(
shape=(1, 1, self.config.hidden_size),
@@ -81,7 +81,12 @@ class TFViTEmbeddings(tf.keras.layers.Layer):
name="position_embeddings",
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor:
"""
@@ -205,6 +210,14 @@ class TFViTPatchEmbeddings(tf.keras.layers.Layer):
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
class TFViTSelfAttention(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -231,6 +244,7 @@ class TFViTSelfAttention(tf.keras.layers.Layer):
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -280,6 +294,20 @@ class TFViTSelfAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFViTSelfOutput(tf.keras.layers.Layer):
"""
@@ -294,6 +322,7 @@ class TFViTSelfOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -301,6 +330,14 @@ class TFViTSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFViTAttention(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -329,6 +366,17 @@ class TFViTAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFViTIntermediate(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -342,6 +390,7 @@ class TFViTIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -349,6 +398,14 @@ class TFViTIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFViTOutput(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -358,6 +415,7 @@ class TFViTOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -366,6 +424,14 @@ class TFViTOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
class TFViTLayer(tf.keras.layers.Layer):
"""This corresponds to the Block class in the timm implementation."""
@@ -383,6 +449,7 @@ class TFViTLayer(tf.keras.layers.Layer):
self.layernorm_after = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="layernorm_after"
)
self.config = config
def call(
self,
@@ -416,6 +483,26 @@ class TFViTLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "vit_output", None) is not None:
with tf.name_scope(self.vit_output.name):
self.vit_output.build(None)
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.config.hidden_size])
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.config.hidden_size])
class TFViTEncoder(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -461,6 +548,15 @@ class TFViTEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFViTMainLayer(tf.keras.layers.Layer):
@@ -539,6 +635,23 @@ class TFViTMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.hidden_size])
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFViTPreTrainedModel(TFPreTrainedModel):
"""
@@ -665,6 +778,14 @@ class TFViTModel(TFViTPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vit", None) is not None:
with tf.name_scope(self.vit.name):
self.vit.build(None)
class TFViTPooler(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -676,6 +797,7 @@ class TFViTPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
@@ -685,6 +807,14 @@ class TFViTPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -714,6 +844,7 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@@ -764,3 +895,14 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vit", None) is not None:
with tf.name_scope(self.vit.name):
self.vit.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])