Unverified Commit 050e0b44 authored by Matt, committed by GitHub

Proper build() methods for TF (#27794)

* Add a convenience method for building in your own name scope

* Second attempt at auto layer building

* Revert "Second attempt at auto layer building"

This reverts commit e03a3aaecf9ec41a805582b83cbdfe3290a631be.

* Attempt #3

* Revert "Attempt #3"

This reverts commit b9df7a0857560d29b5abbed6127d9e9eca77cf47.

* Add missing attributes that we're going to need later

* Add some attributes we're going to need later

* A fourth attempt! Feel the power flow through you!

* Revert "A fourth attempt! Feel the power flow through you!"

This reverts commit 6bf4aaf3875d6f28485f50187617a4c616c8aff7.

* Add more values we'll need later

* TF refactor that we'll need later

* Revert "TF refactor that we'll need later"

This reverts commit ca07202fb5b7b7436b893baa8d688b4f348ea7b9.

* Revert "Revert "TF refactor that we'll need later""

This reverts commit 1beb0f39f293ed9c27594575e1c849aadeb15c13.

* make fixup

* Attempt five!

* Revert "Attempt five!"

This reverts commit 3302207958dfd0374b0447a51c06eea51a506044.

* Attempt six - this time don't add empty methods

* Revert "Attempt six - this time don't add empty methods"

This reverts commit 67d60129be75416b6beb8f47c7d38d77b18d79bb.

* Attempt seven - better base model class detection!

* Revert "Attempt seven - better base model class detection!"

This reverts commit 5f14845e92ea0e87c598da933bfbfee10f553bc9.

* Another attribute we'll need later

* Try again with the missing attribute!

* Revert "Try again with the missing attribute!"

This reverts commit 760c6f30c5dffb3e04b0e73c34a77d1882a0fef7.

* This is the attempt that will pierce the heavens!

* Revert "This is the attempt that will pierce the heavens!"

This reverts commit c868bb657de057aca7a5260350a3f831fc4dfee6.

* Attempt seven - snag list is steadily decreasing

* Revert "Attempt seven - snag list is steadily decreasing"

This reverts commit 46fbd975deda64429bfb3e5fac4fc0370c00d316.

* Attempt eight - will an empty snag list do it?

* Revert "Attempt eight - will an empty snag list do it?"

This reverts commit 7c8a3c2b083253649569e9877e02054ae5cec67b.

* Fixes to Hubert issues that cause problems later

* Trying again with Conv1D/SeparableConv fixes

* Revert "Trying again with Conv1D/SeparableConv fixes"

This reverts commit 55092bca952bc0f750aa1ffe246a640bf1e2036e.

* Apply the build shape fixes to Wav2Vec2 as well

* One more attempt!

* Revert "One more attempt!"

This reverts commit 5ac3e4cb01b9458cc93312873725f9444ae7261c.

* Another attempt!

* Revert "Another attempt!"

This reverts commit ea16d890e019d7de8792a3b8e72f3b1c02adae50.

* Let's see how many failures we get without the internal build method

* Fix OpenAI

* Fix MobileBERT

* (Mostly) fix GroupViT

* Fix BLIP

* One more BLIP fix

* One more BLIP fix!

* Fix Regnet

* Finally fully fix GroupViT

* Fix Data2Vec and add the new AdaptivePool

* Fix Segformer

* Fix Albert

* Fix Deberta/DebertaV2

* Fix XLM

* Actually fix XLM

* Fix Flaubert

* Fix lxmert

* Fix Resnet

* Fix ConvBERT

* Fix ESM

* Fix Convnext / ConvnextV2

* Fix SAM

* Fix Efficientformer

* Fix LayoutLMv3

* Fix speech_to_text

* Fix mpnet and mobilevit

* Fix Swin

* Fix CTRL

* Fix CVT

* Fix DPR

* Fix Wav2Vec2

* Fix T5

* Fix Hubert

* Fix GPT2

* Fix Whisper

* Fix DeiT

* Fix the encoder-decoder / dual-encoder classes

* make fix-copies

* build in name scope

* Fix summarization test

* Fix tied weight names for BART + Blenderbot

* Fix tied weight name building

* Fix to TFESM weight building

* Update TF SAM

* Expand all the shapes out into Big Boy Shapes
parent 52c37882
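The diff below applies one recurring pattern to every model file: each layer keeps a reference to its config (or the raw dimensions it needs) in __init__, and gains an explicit, idempotent build() that constructs every sub-layer inside its own tf.name_scope, passing an explicit input shape instead of waiting for a first forward pass to trigger weight creation. A minimal sketch of that pattern, with illustrative names rather than an actual transformers class:

```python
# Illustrative sketch only - not a class from this commit.
import tensorflow as tf


class ExampleBlock(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
        self.LayerNorm = tf.keras.layers.LayerNormalization(name="LayerNorm")
        # the config is kept so build() can read the dimensions it needs later
        self.config = config

    def build(self, input_shape=None):
        if self.built:  # idempotent: a second call is a no-op
            return
        self.built = True
        # each sub-layer is built under its own name scope so its weights get
        # the hierarchical names that name-based checkpoint loading expects
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])
```

Composite layers (attention blocks, encoders, whole models) are built with sub_layer.build(None), since their own build() never reads input_shape and only dispatches to their children.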
......@@ -91,7 +91,7 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -106,7 +106,12 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids):
"""
......@@ -165,6 +170,7 @@ class TFMPNetPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -174,6 +180,14 @@ class TFMPNetPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFMPNetSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -203,6 +217,7 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o"
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, x, batch_size):
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -247,6 +262,23 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
outputs = (o, attention_probs) if output_attentions else (o,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q", None) is not None:
with tf.name_scope(self.q.name):
self.q.build([None, None, self.config.hidden_size])
if getattr(self, "k", None) is not None:
with tf.name_scope(self.k.name):
self.k.build([None, None, self.config.hidden_size])
if getattr(self, "v", None) is not None:
with tf.name_scope(self.v.name):
self.v.build([None, None, self.config.hidden_size])
if getattr(self, "o", None) is not None:
with tf.name_scope(self.o.name):
self.o.build([None, None, self.config.hidden_size])
class TFMPNetAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -255,6 +287,7 @@ class TFMPNetAttention(tf.keras.layers.Layer):
self.attn = TFMPNetSelfAttention(config, name="attn")
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def prune_heads(self, heads):
raise NotImplementedError
......@@ -267,6 +300,17 @@ class TFMPNetAttention(tf.keras.layers.Layer):
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet
class TFMPNetIntermediate(tf.keras.layers.Layer):
......@@ -281,6 +325,7 @@ class TFMPNetIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -288,6 +333,14 @@ class TFMPNetIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet
class TFMPNetOutput(tf.keras.layers.Layer):
......@@ -299,6 +352,7 @@ class TFMPNetOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -307,6 +361,17 @@ class TFMPNetOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFMPNetLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -329,6 +394,20 @@ class TFMPNetLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "out", None) is not None:
with tf.name_scope(self.out.name):
self.out.build(None)
class TFMPNetEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -344,15 +423,20 @@ class TFMPNetEncoder(tf.keras.layers.Layer):
self.layer = [TFMPNetLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
self.relative_attention_num_buckets = config.relative_attention_num_buckets
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope("relative_attention_bias"):
self.relative_attention_bias = self.add_weight(
name="embeddings",
shape=[self.relative_attention_num_buckets, self.n_heads],
initializer=get_initializer(self.initializer_range),
)
return super().build(input_shape)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def call(
self,
......@@ -561,6 +645,20 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
MPNET_START_DOCSTRING = r"""
......@@ -693,6 +791,14 @@ class TFMPNetModel(TFMPNetPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
class TFMPNetLMHead(tf.keras.layers.Layer):
"""MPNet head for masked and permuted language modeling"""
......@@ -712,10 +818,18 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
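Note on the head above: the only weight it builds itself is the (vocab_size,) output bias, because the output projection is tied to the input embedding matrix (self.decoder); the dense and layer_norm sub-layers receive explicit hidden_size shapes like the other leaf layers. A rough, illustrative sketch of how such a tied head produces logits (not the exact transformers call()):

```python
import tensorflow as tf


def tied_lm_logits(hidden_states, embedding_matrix, bias, dense, act, layer_norm):
    # hidden_states: [batch, seq, hidden_size]; embedding_matrix: [vocab_size, hidden_size]
    h = layer_norm(act(dense(hidden_states)))
    logits = tf.matmul(h, embedding_matrix, transpose_b=True)  # -> [batch, seq, vocab_size]
    return tf.nn.bias_add(logits, bias)  # bias has shape (vocab_size,), built as above
```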
......@@ -816,6 +930,17 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFMPNetClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
......@@ -832,6 +957,7 @@ class TFMPNetClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -841,6 +967,17 @@ class TFMPNetClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -913,6 +1050,17 @@ class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassif
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -930,6 +1078,7 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -999,6 +1148,17 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1019,6 +1179,7 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1073,6 +1234,17 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1092,6 +1264,7 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1159,3 +1332,14 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
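The shapes passed to build() throughout this commit are symbolic everywhere except the last axis (the "Big Boy Shapes" of the commit message), because Dense and LayerNormalization only need the size of the final dimension to create their variables. A quick standalone illustration:

```python
import tensorflow as tf

dense = tf.keras.layers.Dense(4)
dense.build([None, None, 16])
print(dense.kernel.shape)   # (16, 4) - only the last input dim matters

ln = tf.keras.layers.LayerNormalization()
ln.build([None, None, 16])
print(ln.gamma.shape)       # (16,)
```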
......@@ -78,6 +78,7 @@ class TFAttention(tf.keras.layers.Layer):
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.n_state = n_state
self.pruned_heads = set()
def prune_heads(self, heads):
......@@ -153,6 +154,17 @@ class TFAttention(tf.keras.layers.Layer):
outputs = [a] + attn_outputs[1:]
return outputs # a, (attentions)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_attn", None) is not None:
with tf.name_scope(self.c_attn.name):
self.c_attn.build([None, None, self.n_state * 3])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.n_state])
class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
......@@ -162,6 +174,8 @@ class TFMLP(tf.keras.layers.Layer):
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = get_tf_activation("gelu")
self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.nx = nx
self.n_state = n_state
def call(self, x, training=False):
h = self.act(self.c_fc(x))
......@@ -169,6 +183,17 @@ class TFMLP(tf.keras.layers.Layer):
h2 = self.dropout(h2, training=training)
return h2
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_fc", None) is not None:
with tf.name_scope(self.c_fc.name):
self.c_fc.build([None, None, self.n_state])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.nx])
class TFBlock(tf.keras.layers.Layer):
def __init__(self, config, scale=False, **kwargs):
......@@ -178,6 +203,7 @@ class TFBlock(tf.keras.layers.Layer):
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.mlp = TFMLP(4 * nx, config, name="mlp")
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
self.nx = nx
def call(self, x, attention_mask, head_mask, output_attentions, training=False):
output_attn = self.attn(x, attention_mask, head_mask, output_attentions, training=training)
......@@ -190,6 +216,23 @@ class TFBlock(tf.keras.layers.Layer):
outputs = [h] + output_attn[1:]
return outputs # x, (attentions)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "ln_1", None) is not None:
with tf.name_scope(self.ln_1.name):
self.ln_1.build([None, None, self.nx])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "ln_2", None) is not None:
with tf.name_scope(self.ln_2.name):
self.ln_2.build([None, None, self.nx])
@keras_serializable
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
......@@ -213,7 +256,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
def build(self, input_shape):
def build(self, input_shape=None):
with tf.name_scope("positions_embed"):
self.positions_embed = self.add_weight(
name="embeddings",
......@@ -221,7 +264,16 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "tokens_embed", None) is not None:
with tf.name_scope(self.tokens_embed.name):
self.tokens_embed.build(None)
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
def get_input_embeddings(self):
return self.tokens_embed
......@@ -528,6 +580,14 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@add_start_docstrings(
"""
......@@ -613,6 +673,14 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
def prepare_inputs_for_generation(self, inputs, **kwargs):
return {"input_ids": inputs}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@add_start_docstrings(
"""
......@@ -734,6 +802,17 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
"mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "multiple_choice_head", None) is not None:
with tf.name_scope(self.multiple_choice_head.name):
self.multiple_choice_head.build(None)
@add_start_docstrings(
"""
......@@ -761,6 +840,7 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc
use_bias=False,
)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
......@@ -848,3 +928,14 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "score", None) is not None:
with tf.name_scope(self.score.name):
self.score.build([None, None, self.config.n_embd])
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
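Building each sub-layer inside with tf.name_scope(sub_layer.name) is what keeps the variable names identical to the ones previously produced by a real forward pass, which is what name-based checkpoint loading matches against. A rough standalone illustration with plain Keras layers (names chosen for the example):

```python
import tensorflow as tf

proj = tf.keras.layers.Dense(8, name="proj")
with tf.name_scope("block_0"):          # parent layer's scope
    with tf.name_scope(proj.name):      # sub-layer's own scope
        proj.build([None, None, 16])    # creates the kernel without a forward pass

print(proj.kernel.name)   # expected to look like "block_0/proj/kernel:0"
```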
......@@ -268,6 +268,23 @@ class TFOPTAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFOPTDecoderLayer(tf.keras.layers.Layer):
def __init__(self, config: OPTConfig, **kwargs):
......@@ -288,6 +305,7 @@ class TFOPTDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -354,6 +372,26 @@ class TFOPTDecoderLayer(tf.keras.layers.Layer):
return (hidden_states, self_attn_weights, present_key_value)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
OPT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
......@@ -696,6 +734,30 @@ class TFOPTDecoder(tf.keras.layers.Layer):
attentions=all_self_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "project_out", None) is not None:
with tf.name_scope(self.project_out.name):
self.project_out.build([None, None, self.config.hidden_size])
if getattr(self, "project_in", None) is not None:
with tf.name_scope(self.project_in.name):
self.project_in.build([None, None, self.config.word_embed_proj_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFOPTMainLayer(tf.keras.layers.Layer):
......@@ -757,6 +819,14 @@ class TFOPTMainLayer(tf.keras.layers.Layer):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare TF OPT Model outputting raw hidden-states without any specific head on top.",
......@@ -841,6 +911,14 @@ class TFOPTModel(TFOPTPreTrainedModel):
attentions=attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"""
......@@ -1006,3 +1084,11 @@ class TFOPTForCausalLM(TFOPTPreTrainedModel, TFCausalLanguageModelingLoss):
loss=output.loss,
logits=output.logits,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
......@@ -41,7 +41,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
......@@ -330,6 +329,23 @@ class TFPegasusAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Pegasus
class TFPegasusEncoderLayer(tf.keras.layers.Layer):
......@@ -346,6 +362,7 @@ class TFPegasusEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -387,6 +404,26 @@ class TFPegasusEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Pegasus
class TFPegasusDecoderLayer(tf.keras.layers.Layer):
......@@ -416,6 +453,7 @@ class TFPegasusDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -497,6 +535,32 @@ class TFPegasusDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFPegasusPreTrainedModel(TFPreTrainedModel):
config_class = PegasusConfig
......@@ -747,14 +811,6 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
......@@ -812,6 +868,21 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFPegasusDecoder(tf.keras.layers.Layer):
......@@ -953,14 +1024,6 @@ class TFPegasusDecoder(tf.keras.layers.Layer):
positions = self.embed_positions(input_shape, position_ids=position_ids)
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
......@@ -1047,6 +1110,21 @@ class TFPegasusDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFPegasusMainLayer(tf.keras.layers.Layer):
......@@ -1158,6 +1236,22 @@ class TFPegasusMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare PEGASUS Model outputting raw hidden-states without any specific head on top.",
......@@ -1245,6 +1339,14 @@ class TFPegasusModel(TFPegasusPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer):
......@@ -1452,3 +1554,14 @@ class TFPegasusForConditionalGeneration(TFPegasusPreTrainedModel, TFCausalLangua
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "bias_layer", None) is not None:
with tf.name_scope(self.bias_layer.name):
self.bias_layer.build(None)
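The comment in TFPegasusMainLayer.build() above relies on a tf.name_scope detail: a scope name ending in "/" is treated as absolute, so the shared embeddings are registered under the load_weight_prefix at the root rather than nested under whatever scope the caller happens to be in. An illustrative sketch (the prefix value here is made up):

```python
import tensorflow as tf

prefix = "pegasus"   # stands in for self.shared.load_weight_prefix (illustrative)
shared = tf.keras.layers.Embedding(10, 4, name="shared")
with tf.name_scope("tf_pegasus_model"):                    # some deeply nested scope
    with tf.name_scope(prefix + "/" + shared.name + "/"):  # trailing "/" -> absolute scope
        shared.build(None)

# expected to start with "pegasus/shared/", not "tf_pegasus_model/pegasus/shared/"
print(shared.weights[0].name)
```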
......@@ -1292,6 +1292,14 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
return loss
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rag", None) is not None:
with tf.name_scope(self.rag.name):
self.rag.build(None)
@add_start_docstrings_to_model_forward(
"""
......@@ -1743,3 +1751,11 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
output = tf.convert_to_tensor(output)
return tf.cast(output, tensors[0][0][0].dtype)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rag", None) is not None:
with tf.name_scope(self.rag.name):
self.rag.build(None)
......@@ -53,6 +53,7 @@ TF_REGNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFRegNetConvLayer(tf.keras.layers.Layer):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int = 3,
stride: int = 1,
......@@ -75,6 +76,8 @@ class TFRegNetConvLayer(tf.keras.layers.Layer):
)
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.activation = ACT2FN[activation] if activation is not None else tf.identity
self.in_channels = in_channels
self.out_channels = out_channels
def call(self, hidden_state):
hidden_state = self.convolution(self.padding(hidden_state))
......@@ -82,6 +85,17 @@ class TFRegNetConvLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
class TFRegNetEmbeddings(tf.keras.layers.Layer):
"""
......@@ -92,6 +106,7 @@ class TFRegNetEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.num_channels = config.num_channels
self.embedder = TFRegNetConvLayer(
in_channels=config.num_channels,
out_channels=config.embedding_size,
kernel_size=3,
stride=2,
......@@ -113,6 +128,14 @@ class TFRegNetEmbeddings(tf.keras.layers.Layer):
hidden_state = self.embedder(pixel_values)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
class TFRegNetShortCut(tf.keras.layers.Layer):
"""
......@@ -120,16 +143,29 @@ class TFRegNetShortCut(tf.keras.layers.Layer):
downsample the input using `stride=2`.
"""
def __init__(self, out_channels: int, stride: int = 2, **kwargs):
def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwargs):
super().__init__(**kwargs)
self.convolution = tf.keras.layers.Conv2D(
filters=out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution"
)
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.in_channels = in_channels
self.out_channels = out_channels
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
return self.normalization(self.convolution(inputs), training=training)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
class TFRegNetSELayer(tf.keras.layers.Layer):
"""
......@@ -143,6 +179,8 @@ class TFRegNetSELayer(tf.keras.layers.Layer):
tf.keras.layers.Conv2D(filters=reduced_channels, kernel_size=1, activation="relu", name="attention.0"),
tf.keras.layers.Conv2D(filters=in_channels, kernel_size=1, activation="sigmoid", name="attention.2"),
]
self.in_channels = in_channels
self.reduced_channels = reduced_channels
def call(self, hidden_state):
# [batch_size, h, w, num_channels] -> [batch_size, 1, 1, num_channels]
......@@ -152,6 +190,19 @@ class TFRegNetSELayer(tf.keras.layers.Layer):
hidden_state = hidden_state * pooled
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build((None, None, None, None))
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention[0].name):
self.attention[0].build([None, None, None, self.in_channels])
with tf.name_scope(self.attention[1].name):
self.attention[1].build([None, None, None, self.reduced_channels])
class TFRegNetXLayer(tf.keras.layers.Layer):
"""
......@@ -163,17 +214,17 @@ class TFRegNetXLayer(tf.keras.layers.Layer):
should_apply_shortcut = in_channels != out_channels or stride != 1
groups = max(1, out_channels // config.groups_width)
self.shortcut = (
TFRegNetShortCut(out_channels, stride=stride, name="shortcut")
TFRegNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else tf.keras.layers.Activation("linear", name="shortcut")
)
# `self.layers` instead of `self.layer` because that is a reserved argument.
self.layers = [
TFRegNetConvLayer(out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"),
TFRegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"),
TFRegNetConvLayer(
out_channels, stride=stride, groups=groups, activation=config.hidden_act, name="layer.1"
out_channels, out_channels, stride=stride, groups=groups, activation=config.hidden_act, name="layer.1"
),
TFRegNetConvLayer(out_channels, kernel_size=1, activation=None, name="layer.2"),
TFRegNetConvLayer(out_channels, out_channels, kernel_size=1, activation=None, name="layer.2"),
]
self.activation = ACT2FN[config.hidden_act]
......@@ -186,6 +237,18 @@ class TFRegNetXLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFRegNetYLayer(tf.keras.layers.Layer):
"""
......@@ -197,17 +260,17 @@ class TFRegNetYLayer(tf.keras.layers.Layer):
should_apply_shortcut = in_channels != out_channels or stride != 1
groups = max(1, out_channels // config.groups_width)
self.shortcut = (
TFRegNetShortCut(out_channels, stride=stride, name="shortcut")
TFRegNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else tf.keras.layers.Activation("linear", name="shortcut")
)
self.layers = [
TFRegNetConvLayer(out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"),
TFRegNetConvLayer(in_channels, out_channels, kernel_size=1, activation=config.hidden_act, name="layer.0"),
TFRegNetConvLayer(
out_channels, stride=stride, groups=groups, activation=config.hidden_act, name="layer.1"
out_channels, out_channels, stride=stride, groups=groups, activation=config.hidden_act, name="layer.1"
),
TFRegNetSELayer(out_channels, reduced_channels=int(round(in_channels / 4)), name="layer.2"),
TFRegNetConvLayer(out_channels, kernel_size=1, activation=None, name="layer.3"),
TFRegNetConvLayer(out_channels, out_channels, kernel_size=1, activation=None, name="layer.3"),
]
self.activation = ACT2FN[config.hidden_act]
......@@ -220,6 +283,18 @@ class TFRegNetYLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFRegNetStage(tf.keras.layers.Layer):
"""
......@@ -243,6 +318,15 @@ class TFRegNetStage(tf.keras.layers.Layer):
hidden_state = layer_module(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFRegNetEncoder(tf.keras.layers.Layer):
def __init__(self, config: RegNetConfig, **kwargs):
......@@ -282,6 +366,14 @@ class TFRegNetEncoder(tf.keras.layers.Layer):
return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
for stage in self.stages:
with tf.name_scope(stage.name):
stage.build(None)
@keras_serializable
class TFRegNetMainLayer(tf.keras.layers.Layer):
......@@ -333,6 +425,20 @@ class TFRegNetMainLayer(tf.keras.layers.Layer):
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build((None, None, None, None))
class TFRegNetPreTrainedModel(TFPreTrainedModel):
"""
......@@ -418,6 +524,14 @@ class TFRegNetModel(TFRegNetPreTrainedModel):
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "regnet", None) is not None:
with tf.name_scope(self.regnet.name):
self.regnet.build(None)
@add_start_docstrings(
"""
......@@ -479,3 +593,14 @@ class TFRegNetForImageClassification(TFRegNetPreTrainedModel, TFSequenceClassifi
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "regnet", None) is not None:
with tf.name_scope(self.regnet.name):
self.regnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier[1].name):
self.classifier[1].build([None, None, None, self.config.hidden_sizes[-1]])
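The RegNet layers are channels-last, so their build shapes are rank-4 with only the channel axis filled in, and the convolution layers now take in_channels in __init__ because a Conv2D kernel depends on the input channel count. A quick standalone illustration:

```python
import tensorflow as tf

conv = tf.keras.layers.Conv2D(filters=8, kernel_size=3)
conv.build([None, None, None, 4])   # NHWC input with 4 channels
print(conv.kernel.shape)            # (3, 3, 4, 8)

bn = tf.keras.layers.BatchNormalization()
bn.build([None, None, None, 8])
print(bn.gamma.shape)               # (8,)
```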
......@@ -80,7 +80,7 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -102,7 +102,12 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.input_embedding_size])
def call(
self,
......@@ -172,6 +177,7 @@ class TFRemBertSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -261,6 +267,20 @@ class TFRemBertSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RemBert
class TFRemBertSelfOutput(tf.keras.layers.Layer):
......@@ -272,6 +292,7 @@ class TFRemBertSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -280,6 +301,17 @@ class TFRemBertSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->RemBert
class TFRemBertAttention(tf.keras.layers.Layer):
......@@ -321,6 +353,17 @@ class TFRemBertAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RemBert
class TFRemBertIntermediate(tf.keras.layers.Layer):
......@@ -335,6 +378,7 @@ class TFRemBertIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -342,6 +386,14 @@ class TFRemBertIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RemBert
class TFRemBertOutput(tf.keras.layers.Layer):
......@@ -353,6 +405,7 @@ class TFRemBertOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -361,6 +414,17 @@ class TFRemBertOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RemBert
class TFRemBertLayer(tf.keras.layers.Layer):
......@@ -448,6 +512,23 @@ class TFRemBertLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
class TFRemBertEncoder(tf.keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
......@@ -524,6 +605,18 @@ class TFRemBertEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding_hidden_mapping_in", None) is not None:
with tf.name_scope(self.embedding_hidden_mapping_in.name):
self.embedding_hidden_mapping_in.build([None, None, self.config.input_embedding_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RemBert
class TFRemBertPooler(tf.keras.layers.Layer):
......@@ -536,6 +629,7 @@ class TFRemBertPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -545,6 +639,14 @@ class TFRemBertPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
......@@ -562,7 +664,7 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
self.activation = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.decoder = self.add_weight(
name="decoder/weight",
shape=[self.config.vocab_size, self.output_embedding_size],
......@@ -572,7 +674,15 @@ class TFRemBertLMPredictionHead(tf.keras.layers.Layer):
shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, self.config.output_embedding_size])
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self
......@@ -612,6 +722,14 @@ class TFRemBertMLMHead(tf.keras.layers.Layer):
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@keras_serializable
class TFRemBertMainLayer(tf.keras.layers.Layer):
......@@ -800,6 +918,20 @@ class TFRemBertMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFRemBertPreTrainedModel(TFPreTrainedModel):
"""
......@@ -982,6 +1114,14 @@ class TFRemBertModel(TFRemBertPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
@add_start_docstrings("""RemBERT Model with a `language modeling` head on top.""", REMBERT_START_DOCSTRING)
class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLoss):
......@@ -1054,6 +1194,17 @@ class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLos
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING
......@@ -1170,6 +1321,17 @@ class TFRemBertForCausalLM(TFRemBertPreTrainedModel, TFCausalLanguageModelingLos
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""
......@@ -1190,6 +1352,7 @@ class TFRemBertForSequenceClassification(TFRemBertPreTrainedModel, TFSequenceCla
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1246,6 +1409,17 @@ class TFRemBertForSequenceClassification(TFRemBertPreTrainedModel, TFSequenceCla
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1263,6 +1437,7 @@ class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss)
self.classifier = tf.keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -1342,6 +1517,17 @@ class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss)
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1361,6 +1547,7 @@ class TFRemBertForTokenClassification(TFRemBertPreTrainedModel, TFTokenClassific
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1415,6 +1602,17 @@ class TFRemBertForTokenClassification(TFRemBertPreTrainedModel, TFTokenClassific
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1433,6 +1631,7 @@ class TFRemBertForQuestionAnswering(TFRemBertPreTrainedModel, TFQuestionAnswerin
self.qa_outputs = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1501,3 +1700,14 @@ class TFRemBertForQuestionAnswering(TFRemBertPreTrainedModel, TFQuestionAnswerin
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -51,7 +51,13 @@ TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFResNetConvLayer(tf.keras.layers.Layer):
def __init__(
self, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu", **kwargs
self,
in_channels: int,
out_channels: int,
kernel_size: int = 3,
stride: int = 1,
activation: str = "relu",
**kwargs,
) -> None:
super().__init__(**kwargs)
self.pad_value = kernel_size // 2
......@@ -61,6 +67,8 @@ class TFResNetConvLayer(tf.keras.layers.Layer):
# Use same default momentum and epsilon as PyTorch equivalent
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.activation = ACT2FN[activation] if activation is not None else tf.keras.layers.Activation("linear")
self.in_channels = in_channels
self.out_channels = out_channels
def convolution(self, hidden_state: tf.Tensor) -> tf.Tensor:
# Pad to match that done in the PyTorch Conv2D model
......@@ -75,6 +83,17 @@ class TFResNetConvLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
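For convolutional blocks the explicit shape is NHWC, which is why in_channels is now threaded through the constructor: Conv2D needs the channel dimension to size its kernel, and BatchNormalization needs the output width for gamma/beta. A small standalone illustration with arbitrary sizes (not part of this diff):

import tensorflow as tf

conv = tf.keras.layers.Conv2D(64, kernel_size=3, name="convolution")
conv.build([None, None, None, 3])  # NHWC: only the channel dim has to be known
print(conv.kernel.shape)  # (3, 3, 3, 64) -- the kernel depends on in_channels

norm = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
norm.build([None, None, None, 64])  # gamma/beta are sized by out_channels
print(norm.gamma.shape)  # (64,)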
class TFResNetEmbeddings(tf.keras.layers.Layer):
"""
......@@ -84,6 +103,7 @@ class TFResNetEmbeddings(tf.keras.layers.Layer):
def __init__(self, config: ResNetConfig, **kwargs) -> None:
super().__init__(**kwargs)
self.embedder = TFResNetConvLayer(
config.num_channels,
config.embedding_size,
kernel_size=7,
stride=2,
......@@ -105,6 +125,17 @@ class TFResNetEmbeddings(tf.keras.layers.Layer):
hidden_state = self.pooler(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFResNetShortCut(tf.keras.layers.Layer):
"""
......@@ -112,13 +143,15 @@ class TFResNetShortCut(tf.keras.layers.Layer):
downsample the input using `stride=2`.
"""
def __init__(self, out_channels: int, stride: int = 2, **kwargs) -> None:
def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwargs) -> None:
super().__init__(**kwargs)
self.convolution = tf.keras.layers.Conv2D(
out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution"
)
# Use same default momentum and epsilon as PyTorch equivalent
self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.in_channels = in_channels
self.out_channels = out_channels
def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = x
......@@ -126,6 +159,17 @@ class TFResNetShortCut(tf.keras.layers.Layer):
hidden_state = self.normalization(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
class TFResNetBasicLayer(tf.keras.layers.Layer):
"""
......@@ -137,10 +181,10 @@ class TFResNetBasicLayer(tf.keras.layers.Layer):
) -> None:
super().__init__(**kwargs)
should_apply_shortcut = in_channels != out_channels or stride != 1
self.conv1 = TFResNetConvLayer(out_channels, stride=stride, name="layer.0")
self.conv2 = TFResNetConvLayer(out_channels, activation=None, name="layer.1")
self.conv1 = TFResNetConvLayer(in_channels, out_channels, stride=stride, name="layer.0")
self.conv2 = TFResNetConvLayer(out_channels, out_channels, activation=None, name="layer.1")
self.shortcut = (
TFResNetShortCut(out_channels, stride=stride, name="shortcut")
TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else tf.keras.layers.Activation("linear", name="shortcut")
)
......@@ -155,6 +199,20 @@ class TFResNetBasicLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build(None)
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build(None)
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
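Because shortcut can be a plain Activation("linear") rather than a TFResNetShortCut, it is built with None; for a parameterless layer this is a harmless no-op, as the quick check below suggests (illustrative snippet, not from the diff):

import tensorflow as tf

shortcut = tf.keras.layers.Activation("linear", name="shortcut")
shortcut.build(None)  # base Layer.build just marks the layer as built
print(shortcut.weights)  # [] -- nothing to create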
class TFResNetBottleNeckLayer(tf.keras.layers.Layer):
"""
......@@ -176,11 +234,11 @@ class TFResNetBottleNeckLayer(tf.keras.layers.Layer):
super().__init__(**kwargs)
should_apply_shortcut = in_channels != out_channels or stride != 1
reduces_channels = out_channels // reduction
self.conv0 = TFResNetConvLayer(reduces_channels, kernel_size=1, name="layer.0")
self.conv1 = TFResNetConvLayer(reduces_channels, stride=stride, name="layer.1")
self.conv2 = TFResNetConvLayer(out_channels, kernel_size=1, activation=None, name="layer.2")
self.conv0 = TFResNetConvLayer(in_channels, reduces_channels, kernel_size=1, name="layer.0")
self.conv1 = TFResNetConvLayer(reduces_channels, reduces_channels, stride=stride, name="layer.1")
self.conv2 = TFResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None, name="layer.2")
self.shortcut = (
TFResNetShortCut(out_channels, stride=stride, name="shortcut")
TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else tf.keras.layers.Activation("linear", name="shortcut")
)
......@@ -196,6 +254,23 @@ class TFResNetBottleNeckLayer(tf.keras.layers.Layer):
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv0", None) is not None:
with tf.name_scope(self.conv0.name):
self.conv0.build(None)
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build(None)
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build(None)
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
class TFResNetStage(tf.keras.layers.Layer):
"""
......@@ -221,6 +296,15 @@ class TFResNetStage(tf.keras.layers.Layer):
hidden_state = layer(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stage_layers", None) is not None:
for layer in self.stage_layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFResNetEncoder(tf.keras.layers.Layer):
def __init__(self, config: ResNetConfig, **kwargs) -> None:
......@@ -264,6 +348,15 @@ class TFResNetEncoder(tf.keras.layers.Layer):
return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stages", None) is not None:
for layer in self.stages:
with tf.name_scope(layer.name):
layer.build(None)
class TFResNetPreTrainedModel(TFPreTrainedModel):
"""
......@@ -364,6 +457,17 @@ class TFResNetMainLayer(tf.keras.layers.Layer):
hidden_states=hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
@add_start_docstrings(
"The bare ResNet model outputting raw features without any specific head on top.",
......@@ -403,6 +507,14 @@ class TFResNetModel(TFResNetPreTrainedModel):
)
return resnet_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "resnet", None) is not None:
with tf.name_scope(self.resnet.name):
self.resnet.build(None)
@add_start_docstrings(
"""
......@@ -422,6 +534,7 @@ class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassifi
if config.num_labels > 0
else tf.keras.layers.Activation("linear", name="classifier.1")
)
self.config = config
def classifier(self, x: tf.Tensor) -> tf.Tensor:
x = tf.keras.layers.Flatten()(x)
......@@ -466,3 +579,14 @@ class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassifi
return (loss,) + output if loss is not None else output
return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "resnet", None) is not None:
with tf.name_scope(self.resnet.name):
self.resnet.build(None)
if getattr(self, "classifier_layer", None) is not None:
with tf.name_scope(self.classifier_layer.name):
self.classifier_layer.build([None, None, self.config.hidden_sizes[-1]])
......@@ -89,7 +89,7 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -111,7 +111,12 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
......@@ -184,6 +189,7 @@ class TFRobertaPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -193,6 +199,14 @@ class TFRobertaPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta
class TFRobertaSelfAttention(tf.keras.layers.Layer):
......@@ -222,6 +236,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -311,6 +326,20 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta
class TFRobertaSelfOutput(tf.keras.layers.Layer):
......@@ -322,6 +351,7 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -330,6 +360,17 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta
class TFRobertaAttention(tf.keras.layers.Layer):
......@@ -371,6 +412,17 @@ class TFRobertaAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta
class TFRobertaIntermediate(tf.keras.layers.Layer):
......@@ -385,6 +437,7 @@ class TFRobertaIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -392,6 +445,14 @@ class TFRobertaIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta
class TFRobertaOutput(tf.keras.layers.Layer):
......@@ -403,6 +464,7 @@ class TFRobertaOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -411,6 +473,17 @@ class TFRobertaOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta
class TFRobertaLayer(tf.keras.layers.Layer):
......@@ -498,6 +571,23 @@ class TFRobertaLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta
class TFRobertaEncoder(tf.keras.layers.Layer):
......@@ -568,6 +658,15 @@ class TFRobertaEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFRobertaMainLayer(tf.keras.layers.Layer):
......@@ -765,6 +864,20 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
class TFRobertaPreTrainedModel(TFPreTrainedModel):
"""
......@@ -946,6 +1059,14 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
class TFRobertaLMHead(tf.keras.layers.Layer):
"""Roberta Head for masked language modeling."""
......@@ -965,10 +1086,18 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
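Heads that own variables via add_weight() keep creating them at the top of build() and only guard the sub-layer builds, mirroring the diff above. A hypothetical TinyLMHead sketch of that shape (identifiers are illustrative, not from this PR):

import tensorflow as tf

class TinyLMHead(tf.keras.layers.Layer):
    def __init__(self, hidden_size: int, vocab_size: int, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")

    def build(self, input_shape=None):
        # own weight, created directly on this layer
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.hidden_size])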
def get_output_embeddings(self):
return self.decoder
......@@ -1076,6 +1205,17 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
......@@ -1198,6 +1338,17 @@ class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLos
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFRobertaClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
......@@ -1217,6 +1368,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -1226,6 +1378,17 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1302,6 +1465,17 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1323,6 +1497,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -1392,6 +1567,17 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1417,6 +1603,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1475,6 +1662,17 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1495,6 +1693,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1566,3 +1765,14 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -94,7 +94,7 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -116,7 +116,12 @@ class TFRobertaPreLayerNormEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
......@@ -189,6 +194,7 @@ class TFRobertaPreLayerNormPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -198,6 +204,14 @@ class TFRobertaPreLayerNormPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->RobertaPreLayerNorm
class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer):
......@@ -227,6 +241,7 @@ class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -316,6 +331,20 @@ class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
......@@ -325,6 +354,7 @@ class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -333,6 +363,14 @@ class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
......@@ -341,6 +379,7 @@ class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer):
self.self_attention = TFRobertaPreLayerNormSelfAttention(config, name="self")
self.dense_output = TFRobertaPreLayerNormSelfOutput(config, name="output")
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention.prune_heads
def prune_heads(self, heads):
......@@ -376,6 +415,20 @@ class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
......@@ -390,6 +443,7 @@ class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.LayerNorm(inputs=hidden_states)
......@@ -398,6 +452,17 @@ class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer):
def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
......@@ -407,6 +472,7 @@ class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -415,6 +481,14 @@ class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RobertaPreLayerNorm
class TFRobertaPreLayerNormLayer(tf.keras.layers.Layer):
......@@ -502,6 +576,23 @@ class TFRobertaPreLayerNormLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->RobertaPreLayerNorm
class TFRobertaPreLayerNormEncoder(tf.keras.layers.Layer):
......@@ -572,6 +663,15 @@ class TFRobertaPreLayerNormEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFRobertaPreLayerNormMainLayer(tf.keras.layers.Layer):
......@@ -765,6 +865,23 @@ class TFRobertaPreLayerNormMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
class TFRobertaPreLayerNormPreTrainedModel(TFPreTrainedModel):
......@@ -948,6 +1065,14 @@ class TFRobertaPreLayerNormModel(TFRobertaPreLayerNormPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->RobertaPreLayerNorm
class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer):
......@@ -968,10 +1093,18 @@ class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
......@@ -1085,6 +1218,17 @@ class TFRobertaPreLayerNormForMaskedLM(TFRobertaPreLayerNormPreTrainedModel, TFM
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFCausalLanguageModelingLoss):
......@@ -1214,6 +1358,17 @@ class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFC
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->RobertaPreLayerNorm
class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer):
......@@ -1234,6 +1389,7 @@ class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -1243,6 +1399,17 @@ class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1322,6 +1489,17 @@ class TFRobertaPreLayerNormForSequenceClassification(
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1344,6 +1522,7 @@ class TFRobertaPreLayerNormForMultipleChoice(TFRobertaPreLayerNormPreTrainedMode
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
......@@ -1415,6 +1594,17 @@ class TFRobertaPreLayerNormForMultipleChoice(TFRobertaPreLayerNormPreTrainedMode
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1442,6 +1632,7 @@ class TFRobertaPreLayerNormForTokenClassification(TFRobertaPreLayerNormPreTraine
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1499,6 +1690,17 @@ class TFRobertaPreLayerNormForTokenClassification(TFRobertaPreLayerNormPreTraine
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1521,6 +1723,7 @@ class TFRobertaPreLayerNormForQuestionAnswering(TFRobertaPreLayerNormPreTrainedM
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1591,3 +1794,14 @@ class TFRobertaPreLayerNormForQuestionAnswering(TFRobertaPreLayerNormPreTrainedM
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta_prelayernorm", None) is not None:
with tf.name_scope(self.roberta_prelayernorm.name):
self.roberta_prelayernorm.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -142,7 +142,7 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -157,7 +157,12 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
def call(
self,
......@@ -218,6 +223,7 @@ class TFRoFormerSelfAttention(tf.keras.layers.Layer):
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.rotary_value = config.rotary_value
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -307,6 +313,20 @@ class TFRoFormerSelfAttention(tf.keras.layers.Layer):
return query_layer, key_layer, value_layer
return query_layer, key_layer
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RoFormer
class TFRoFormerSelfOutput(tf.keras.layers.Layer):
......@@ -318,6 +338,7 @@ class TFRoFormerSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -326,6 +347,17 @@ class TFRoFormerSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRoFormerAttention(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
......@@ -361,6 +393,17 @@ class TFRoFormerAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RoFormer
class TFRoFormerIntermediate(tf.keras.layers.Layer):
......@@ -375,6 +418,7 @@ class TFRoFormerIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -382,6 +426,14 @@ class TFRoFormerIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RoFormer
class TFRoFormerOutput(tf.keras.layers.Layer):
......@@ -393,6 +445,7 @@ class TFRoFormerOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -401,6 +454,17 @@ class TFRoFormerOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRoFormerLayer(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
......@@ -436,6 +500,20 @@ class TFRoFormerLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "roformer_output", None) is not None:
with tf.name_scope(self.roformer_output.name):
self.roformer_output.build(None)
class TFRoFormerEncoder(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
......@@ -491,6 +569,18 @@ class TFRoFormerEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, **kwargs):
......@@ -508,6 +598,7 @@ class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -516,6 +607,17 @@ class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
class TFRoFormerLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
......@@ -530,10 +632,15 @@ class TFRoFormerLMPredictionHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.input_embeddings
......@@ -572,6 +679,14 @@ class TFRoFormerMLMHead(tf.keras.layers.Layer):
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@keras_serializable
class TFRoFormerMainLayer(tf.keras.layers.Layer):
......@@ -687,6 +802,20 @@ class TFRoFormerMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "embeddings_project", None) is not None:
with tf.name_scope(self.embeddings_project.name):
self.embeddings_project.build([None, None, self.config.embedding_size])
class TFRoFormerPreTrainedModel(TFPreTrainedModel):
"""
......@@ -834,6 +963,14 @@ class TFRoFormerModel(TFRoFormerPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
@add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING)
class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingLoss):
......@@ -904,6 +1041,17 @@ class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING
......@@ -977,6 +1125,17 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
class TFRoFormerClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
......@@ -996,6 +1155,7 @@ class TFRoFormerClassificationHead(tf.keras.layers.Layer):
self.classifier_act_fn = get_tf_activation(config.hidden_act)
else:
self.classifier_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = hidden_states[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -1007,6 +1167,17 @@ class TFRoFormerClassificationHead(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1075,6 +1246,17 @@ class TFRoFormerForSequenceClassification(TFRoFormerPreTrainedModel, TFSequenceC
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1092,6 +1274,7 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos
self.classifier = tf.keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
......@@ -1167,6 +1350,20 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1186,6 +1383,7 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1238,6 +1436,17 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1256,6 +1465,7 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
self.qa_outputs = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1321,3 +1531,14 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roformer", None) is not None:
with tf.name_scope(self.roformer.name):
self.roformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -150,6 +150,14 @@ class TFSamPatchEmbeddings(tf.keras.layers.Layer):
embeddings = self.projection(tf.transpose(pixel_values, perm=[0, 2, 3, 1]))
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
class TFSamMLPBlock(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -157,6 +165,7 @@ class TFSamMLPBlock(tf.keras.layers.Layer):
self.lin1 = tf.keras.layers.Dense(config.mlp_dim, name="lin1")
self.lin2 = tf.keras.layers.Dense(config.hidden_size, name="lin2")
self.act = ACT2FN[config.hidden_act]
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.lin1(hidden_states)
......@@ -164,6 +173,17 @@ class TFSamMLPBlock(tf.keras.layers.Layer):
hidden_states = self.lin2(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.config.hidden_size])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.config.mlp_dim])
class TFSamLayerNorm(tf.keras.layers.Layer):
r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
......@@ -257,6 +277,23 @@ class TFSamAttention(tf.keras.layers.Layer):
return out
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.hidden_size])
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.hidden_size])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.internal_dim])
class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer):
def __init__(self, config, attention_downsample_rate: int = 2, skip_first_layer_pe: bool = False, **kwargs):
......@@ -345,6 +382,35 @@ class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, None, self.hidden_size])
if getattr(self, "cross_attn_token_to_image", None) is not None:
with tf.name_scope(self.cross_attn_token_to_image.name):
self.cross_attn_token_to_image.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, None, self.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "layer_norm3", None) is not None:
with tf.name_scope(self.layer_norm3.name):
self.layer_norm3.build([None, None, None, self.hidden_size])
if getattr(self, "layer_norm4", None) is not None:
with tf.name_scope(self.layer_norm4.name):
self.layer_norm4.build([None, None, None, self.hidden_size])
if getattr(self, "cross_attn_image_to_token", None) is not None:
with tf.name_scope(self.cross_attn_image_to_token.name):
self.cross_attn_image_to_token.build(None)
class TFSamTwoWayTransformer(tf.keras.layers.Layer):
def __init__(self, config: SamMaskDecoderConfig, **kwargs):
......@@ -412,6 +478,20 @@ class TFSamTwoWayTransformer(tf.keras.layers.Layer):
queries = self.layer_norm_final_attn(queries)
return queries, keys, all_attentions
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "final_attn_token_to_image", None) is not None:
with tf.name_scope(self.final_attn_token_to_image.name):
self.final_attn_token_to_image.build(None)
if getattr(self, "layer_norm_final_attn", None) is not None:
with tf.name_scope(self.layer_norm_final_attn.name):
self.layer_norm_final_attn.build([None, None, None, self.config.hidden_size])
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFSamFeedForward(tf.keras.layers.Layer):
def __init__(
......@@ -427,6 +507,8 @@ class TFSamFeedForward(tf.keras.layers.Layer):
for i in range(num_layers - 2)
]
self.sigmoid_output = sigmoid_output
self.hidden_dim = hidden_dim
self.input_dim = input_dim
def call(self, hidden_states):
hidden_states = self.proj_in(hidden_states)
......@@ -439,6 +521,21 @@ class TFSamFeedForward(tf.keras.layers.Layer):
hidden_states = tf.sigmoid(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj_in", None) is not None:
with tf.name_scope(self.proj_in.name):
self.proj_in.build([None, None, self.input_dim])
if getattr(self, "proj_out", None) is not None:
with tf.name_scope(self.proj_out.name):
self.proj_out.build([None, None, self.hidden_dim])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build([None, None, self.hidden_dim])
class TFSamMaskDecoder(tf.keras.layers.Layer):
def __init__(self, config: SamMaskDecoderConfig, **kwargs):
......@@ -483,12 +580,30 @@ class TFSamMaskDecoder(tf.keras.layers.Layer):
name="iou_prediction_head",
)
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
self.iou_token = self.add_weight(shape=(1, self.hidden_size), name="iou_token.weight", trainable=True)
self.mask_tokens = self.add_weight(
shape=(self.num_mask_tokens, self.hidden_size), name="mask_tokens.weight", trainable=True
)
super().build(input_shape)
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "upscale_conv1", None) is not None:
with tf.name_scope(self.upscale_conv1.name):
self.upscale_conv1.build([None, self.hidden_size, None, None])
if getattr(self, "upscale_conv2", None) is not None:
with tf.name_scope(self.upscale_conv2.name):
self.upscale_conv2.build([None, self.hidden_size // 4, None, None])
if getattr(self, "upscale_layer_norm", None) is not None:
with tf.name_scope(self.upscale_layer_norm.name):
self.upscale_layer_norm.build(None)
if getattr(self, "iou_prediction_head", None) is not None:
with tf.name_scope(self.iou_prediction_head.name):
self.iou_prediction_head.build(None)
def call(
self,
......@@ -615,6 +730,7 @@ class TFSamMaskEmbedding(tf.keras.layers.Layer):
self.conv3 = tf.keras.layers.Conv2D(config.hidden_size, kernel_size=1, name="conv3")
self.layer_norm1 = TFSamLayerNorm(self.mask_input_channels, config.layer_norm_eps, name="layer_norm1")
self.layer_norm2 = TFSamLayerNorm(self.mask_input_channels * 4, config.layer_norm_eps, name="layer_norm2")
self.config = config
def call(self, masks):
masks = tf.transpose(masks, perm=(0, 2, 3, 1)) # Convert to channels-last
......@@ -629,24 +745,21 @@ class TFSamMaskEmbedding(tf.keras.layers.Layer):
dense_embeddings = tf.transpose(dense_embeddings, perm=(0, 3, 1, 2)) # Convert back to channels-first
return dense_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
# This class needs an explicit build method because it isn't called with the standard dummy inputs
conv1_shape = [None, None, None, 1]
conv2_shape = [None, None, None, self.mask_input_channels]
conv3_shape = [None, None, None, self.mask_input_channels * 4]
layer_norm1_shape = [None, None, None, self.mask_input_channels]
layer_norm2_shape = [None, None, None, self.mask_input_channels * 4]
if self.built:
return
self.built = True
with tf.name_scope("conv1"):
self.conv1.build(conv1_shape)
self.conv1.build([None, None, None, 1])
with tf.name_scope("conv2"):
self.conv2.build(conv2_shape)
self.conv2.build([None, None, None, self.mask_input_channels])
with tf.name_scope("conv3"):
self.conv3.build(conv3_shape)
self.conv3.build([None, None, None, self.mask_input_channels * 4])
with tf.name_scope("layer_norm1"):
self.layer_norm1.build(layer_norm1_shape)
self.layer_norm1.build([None, None, None, self.mask_input_channels])
with tf.name_scope("layer_norm2"):
self.layer_norm2.build(layer_norm2_shape)
super().build(input_shape)
self.layer_norm2.build([None, None, None, self.mask_input_channels * 4])
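As the comment at the top of this build() notes, the mask-embedding branch is optional, so the standard dummy inputs never reach these sublayers and shape inference at call time cannot be relied on; the channel counts therefore have to be spelled out from the config. A rough sketch of that situation with invented names (TFOptionalBranch, channels):

import tensorflow as tf

class TFOptionalBranch(tf.keras.layers.Layer):
    def __init__(self, channels, **kwargs):
        super().__init__(**kwargs)
        self.conv = tf.keras.layers.Conv2D(channels, kernel_size=1, name="conv")
        self.channels = channels

    def call(self, masks=None):
        if masks is None:  # the path the dummy inputs take: conv is never exercised
            return None
        return self.conv(masks)

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # build explicitly with the channel count known up front, since call()
        # may never reach the conv and so would never trigger shape inference
        with tf.name_scope("conv"):
            self.conv.build([None, None, None, 1])

branch = TFOptionalBranch(channels=8, name="branch")
branch.build(None)               # creates the conv weights without calling the layer
print(len(branch.conv.weights))  # 2 (kernel and bias)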
class TFSamPromptEncoder(tf.keras.layers.Layer):
......@@ -664,7 +777,7 @@ class TFSamPromptEncoder(tf.keras.layers.Layer):
self.not_a_point_embed = None
self.config = config
def build(self, input_shape):
def build(self, input_shape=None):
self.no_mask_embed = self.add_weight(
name="no_mask_embed.weight",
shape=(1, self.hidden_size),
......@@ -691,7 +804,13 @@ class TFSamPromptEncoder(tf.keras.layers.Layer):
self.mask_embed.build(
(None, self.config.mask_input_channels, self.config.image_size, self.config.image_size)
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "mask_embed", None) is not None:
with tf.name_scope(self.mask_embed.name):
self.mask_embed.build(None)
def _embed_points(self, points: tf.Tensor, labels: tf.Tensor, pad: bool) -> tf.Tensor:
"""Embeds point prompts."""
......@@ -812,7 +931,7 @@ class TFSamVisionAttention(tf.keras.layers.Layer):
raise ValueError("Input size must be provided if using relative positional encoding.")
self.config = config
def build(self, input_shape):
def build(self, input_shape=None):
if self.input_size is not None:
# initialize relative positional embeddings
self.rel_pos_h = self.add_weight(
......@@ -821,7 +940,16 @@ class TFSamVisionAttention(tf.keras.layers.Layer):
self.rel_pos_w = self.add_weight(
shape=(2 * self.input_size[1] - 1, self.head_dim), initializer="zeros", name="rel_pos_w"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "qkv", None) is not None:
with tf.name_scope(self.qkv.name):
self.qkv.build([None, None, self.config.hidden_size])
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, self.config.hidden_size])
def get_rel_pos(self, q_size: int, k_size: int, rel_pos: tf.Tensor) -> tf.Tensor:
"""
......@@ -949,6 +1077,7 @@ class TFSamVisionLayer(tf.keras.layers.Layer):
self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
self.mlp = TFSamMLPBlock(config, name="mlp")
self.window_size = window_size
self.config = config
def window_partition(self, hidden_states: tf.Tensor, window_size: int) -> Tuple[tf.Tensor, Tuple[int, int]]:
batch_size, height, width, channel = shape_list(hidden_states)
......@@ -1016,6 +1145,23 @@ class TFSamVisionLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, None, self.config.hidden_size])
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, None, self.config.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
class TFSamVisionNeck(tf.keras.layers.Layer):
def __init__(self, config: SamVisionConfig, **kwargs):
......@@ -1047,6 +1193,23 @@ class TFSamVisionNeck(tf.keras.layers.Layer):
hidden_states = tf.transpose(hidden_states, perm=[0, 3, 1, 2])
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build([None, None, None, self.config.hidden_size])
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build(None)
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build([None, None, None, self.config.output_channels])
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build(None)
class TFSamVisionEncoder(tf.keras.layers.Layer):
def __init__(self, config: SamVisionConfig, **kwargs):
......@@ -1069,7 +1232,10 @@ class TFSamVisionEncoder(tf.keras.layers.Layer):
self.neck = TFSamVisionNeck(config, name="neck")
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.config.use_abs_pos:
# Initialize absolute positional embedding with pretrain image size.
self.pos_embed = self.add_weight(
......@@ -1083,7 +1249,16 @@ class TFSamVisionEncoder(tf.keras.layers.Layer):
trainable=True,
name="pos_embed",
)
super().build(input_shape)
if getattr(self, "patch_embed", None) is not None:
with tf.name_scope(self.patch_embed.name):
self.patch_embed.build(None)
if getattr(self, "neck", None) is not None:
with tf.name_scope(self.neck.name):
self.neck.build(None)
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
def get_input_embeddings(self):
return self.patch_embed
......@@ -1463,3 +1638,20 @@ class TFSamModel(TFSamPreTrainedModel):
vision_attentions=attns if self.config.output_attentions else None,
mask_decoder_attentions=output.mask_decoder_attentions if self.config.output_attentions else None,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "shared_image_embedding", None) is not None:
with tf.name_scope(self.shared_image_embedding.name):
self.shared_image_embedding.build(None)
if getattr(self, "vision_encoder", None) is not None:
with tf.name_scope(self.vision_encoder.name):
self.vision_encoder.build(None)
if getattr(self, "prompt_encoder", None) is not None:
with tf.name_scope(self.prompt_encoder.name):
self.prompt_encoder.build(None)
if getattr(self, "mask_decoder", None) is not None:
with tf.name_scope(self.mask_decoder.name):
self.mask_decoder.build(None)
......@@ -79,7 +79,7 @@ class TFSegformerDropPath(tf.keras.layers.Layer):
class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer):
"""Construct the overlapping patch embeddings."""
def __init__(self, patch_size, stride, hidden_size, **kwargs):
def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs):
super().__init__(**kwargs)
self.padding = tf.keras.layers.ZeroPadding2D(padding=patch_size // 2)
self.proj = tf.keras.layers.Conv2D(
......@@ -87,6 +87,8 @@ class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer):
)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm")
self.num_channels = num_channels
self.hidden_size = hidden_size
def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]:
embeddings = self.proj(self.padding(pixel_values))
......@@ -99,6 +101,17 @@ class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer):
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, None, self.num_channels])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.hidden_size])
class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer):
"""SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
......@@ -196,18 +209,47 @@ class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer):
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.hidden_size])
if getattr(self, "sr", None) is not None:
with tf.name_scope(self.sr.name):
self.sr.build([None, None, None, self.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.hidden_size])
class TFSegformerSelfOutput(tf.keras.layers.Layer):
def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.hidden_size = hidden_size
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.hidden_size])
class TFSegformerAttention(tf.keras.layers.Layer):
def __init__(
......@@ -237,6 +279,17 @@ class TFSegformerAttention(tf.keras.layers.Layer):
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFSegformerDWConv(tf.keras.layers.Layer):
def __init__(self, dim: int = 768, **kwargs):
......@@ -244,6 +297,7 @@ class TFSegformerDWConv(tf.keras.layers.Layer):
self.depthwise_convolution = tf.keras.layers.Conv2D(
filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv"
)
self.dim = dim
def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor:
batch_size = shape_list(hidden_states)[0]
......@@ -257,6 +311,14 @@ class TFSegformerDWConv(tf.keras.layers.Layer):
hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels))
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "depthwise_convolution", None) is not None:
with tf.name_scope(self.depthwise_convolution.name):
self.depthwise_convolution.build([None, None, None, self.dim])
class TFSegformerMixFFN(tf.keras.layers.Layer):
def __init__(
......@@ -277,6 +339,8 @@ class TFSegformerMixFFN(tf.keras.layers.Layer):
self.intermediate_act_fn = config.hidden_act
self.dense2 = tf.keras.layers.Dense(out_features, name="dense2")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.hidden_features = hidden_features
self.in_features = in_features
def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
hidden_states = self.dense1(hidden_states)
......@@ -287,6 +351,20 @@ class TFSegformerMixFFN(tf.keras.layers.Layer):
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense1", None) is not None:
with tf.name_scope(self.dense1.name):
self.dense1.build([None, None, self.in_features])
if getattr(self, "depthwise_convolution", None) is not None:
with tf.name_scope(self.depthwise_convolution.name):
self.depthwise_convolution.build(None)
if getattr(self, "dense2", None) is not None:
with tf.name_scope(self.dense2.name):
self.dense2.build([None, None, self.hidden_features])
class TFSegformerLayer(tf.keras.layers.Layer):
"""This corresponds to the Block class in the original implementation."""
......@@ -314,6 +392,7 @@ class TFSegformerLayer(tf.keras.layers.Layer):
self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2")
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp")
self.hidden_size = hidden_size
def call(
self,
......@@ -347,6 +426,23 @@ class TFSegformerLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm_1", None) is not None:
with tf.name_scope(self.layer_norm_1.name):
self.layer_norm_1.build([None, None, self.hidden_size])
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm_2", None) is not None:
with tf.name_scope(self.layer_norm_2.name):
self.layer_norm_2.build([None, None, self.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
class TFSegformerEncoder(tf.keras.layers.Layer):
def __init__(self, config: SegformerConfig, **kwargs):
......@@ -363,6 +459,7 @@ class TFSegformerEncoder(tf.keras.layers.Layer):
TFSegformerOverlapPatchEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
name=f"patch_embeddings.{i}",
)
......@@ -449,6 +546,24 @@ class TFSegformerEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norms", None) is not None:
for layer, shape in zip(self.layer_norms, self.config.hidden_sizes):
with tf.name_scope(layer.name):
layer.build([None, None, shape])
if getattr(self, "block", None) is not None:
for block in self.block:
for layer in block:
with tf.name_scope(layer.name):
layer.build(None)
if getattr(self, "embeddings", None) is not None:
for layer in self.embeddings:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFSegformerMainLayer(tf.keras.layers.Layer):
......@@ -509,6 +624,14 @@ class TFSegformerMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
class TFSegformerPreTrainedModel(TFPreTrainedModel):
"""
......@@ -605,6 +728,14 @@ class TFSegformerModel(TFSegformerPreTrainedModel):
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
@add_start_docstrings(
"""
......@@ -622,6 +753,7 @@ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceCl
# Classifier head
self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier")
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -668,15 +800,27 @@ class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceCl
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
class TFSegformerMLP(tf.keras.layers.Layer):
"""
Linear Embedding.
"""
def __init__(self, config: SegformerConfig, **kwargs):
def __init__(self, input_dim: int, config: SegformerConfig, **kwargs):
super().__init__(**kwargs)
self.proj = tf.keras.layers.Dense(config.decoder_hidden_size, name="proj")
self.input_dim = input_dim
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
height = shape_list(hidden_states)[1]
......@@ -686,6 +830,14 @@ class TFSegformerMLP(tf.keras.layers.Layer):
hidden_states = self.proj(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, self.input_dim])
class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
def __init__(self, config: SegformerConfig, **kwargs):
......@@ -693,7 +845,7 @@ class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
# linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size
mlps = []
for i in range(config.num_encoder_blocks):
mlp = TFSegformerMLP(config, name=f"linear_c.{i}")
mlp = TFSegformerMLP(config=config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}")
mlps.append(mlp)
self.mlps = mlps
......@@ -741,6 +893,26 @@ class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
return logits
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "linear_fuse", None) is not None:
with tf.name_scope(self.linear_fuse.name):
self.linear_fuse.build(
[None, None, None, self.config.decoder_hidden_size * self.config.num_encoder_blocks]
)
if getattr(self, "batch_norm", None) is not None:
with tf.name_scope(self.batch_norm.name):
self.batch_norm.build([None, None, None, self.config.decoder_hidden_size])
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, None, self.config.decoder_hidden_size])
if getattr(self, "mlps", None) is not None:
for layer in self.mlps:
with tf.name_scope(layer.name):
layer.build(None)
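The decoder_hidden_size * num_encoder_blocks channel count used for linear_fuse reflects the decode head concatenating one feature map per encoder block, each already projected to decoder_hidden_size channels, along the channel axis. The arithmetic with hypothetical config values:

import tensorflow as tf

decoder_hidden_size, num_encoder_blocks = 256, 4  # hypothetical values
feature_maps = [tf.random.normal((1, 32, 32, decoder_hidden_size)) for _ in range(num_encoder_blocks)]
fused_input = tf.concat(feature_maps, axis=-1)    # channel-wise concatenation
print(fused_input.shape[-1])                      # 1024 == decoder_hidden_size * num_encoder_blocks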
@add_start_docstrings(
"""SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""",
......@@ -851,3 +1023,14 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
if getattr(self, "decode_head", None) is not None:
with tf.name_scope(self.decode_head.name):
self.decode_head.build(None)
......@@ -166,6 +166,15 @@ class TFConv1dSubsampler(tf.keras.layers.Layer):
hidden_states = glu(hidden_states, axis=2) # GLU over the Channel dimension
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv_layers", None) is not None:
for i, layer in enumerate(self.conv_layers):
with tf.name_scope(layer.name):
layer.build([None, None, self.in_channels] if i == 0 else [None, None, self.mid_channels // 2])
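The mid_channels // 2 used above for every conv after the first follows from the GLU in call(): the GLU splits the channel axis in half and gates one half with the other, so each subsequent conv sees half of the previous conv's output channels. A tiny self-contained check (the shapes are made up):

import tensorflow as tf

x = tf.random.normal((2, 10, 8))                  # (batch, time, channels)
a, b = tf.split(x, num_or_size_splits=2, axis=2)  # GLU: split the channel dimension
gated = a * tf.sigmoid(b)                         # gate one half with the other
print(gated.shape)                                # (2, 10, 4): channels halved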
class TFSpeech2TextSinusoidalPositionalEmbedding(tf.keras.layers.Layer):
"""This module produces sinusoidal positional embeddings of any length."""
......@@ -379,6 +388,23 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: Speech2TextConfig, **kwargs):
......@@ -394,6 +420,7 @@ class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False
......@@ -434,6 +461,26 @@ class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer):
def __init__(self, config: Speech2TextConfig, **kwargs):
......@@ -463,6 +510,7 @@ class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -546,6 +594,32 @@ class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFSpeech2TextPreTrainedModel(TFPreTrainedModel):
config_class = Speech2TextConfig
......@@ -870,6 +944,24 @@ class TFSpeech2TextEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build(None)
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFSpeech2TextDecoder(tf.keras.layers.Layer):
......@@ -1092,6 +1184,24 @@ class TFSpeech2TextDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFSpeech2TextMainLayer(tf.keras.layers.Layer):
......@@ -1197,6 +1307,17 @@ class TFSpeech2TextMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare Speech2Text Model outputting raw hidden-states without any specific head on top.",
......@@ -1279,6 +1400,14 @@ class TFSpeech2TextModel(TFSpeech2TextPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"The Speech2Text Model with a language modeling head. Can be used for summarization.",
......@@ -1291,6 +1420,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
self.lm_head = tf.keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head")
# TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate
self.supports_xla_generation = False
self.config = config
def get_encoder(self):
return self.model.encoder
......@@ -1461,6 +1591,17 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
"use_cache": use_cache, # change this to avoid caching (presumably for debugging)
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.d_model])
def tf_to_pt_weight_rename(self, tf_weight):
if tf_weight == "lm_head.weight":
return tf_weight, "model.decoder.embed_tokens.weight"
......@@ -283,6 +283,7 @@ class TFSwinEmbeddings(tf.keras.layers.Layer):
self.norm = tf.keras.layers.LayerNormalization(name="norm", epsilon=1e-5)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.config = config
def build(self, input_shape: tf.TensorShape) -> None:
if self.use_mask_token:
......@@ -296,7 +297,19 @@ class TFSwinEmbeddings(tf.keras.layers.Layer):
)
else:
self.position_embeddings = None
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
if getattr(self, "norm", None) is not None:
with tf.name_scope(self.norm.name):
self.norm.build([None, None, self.config.embed_dim])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
def call(
self, pixel_values: tf.Tensor, bool_masked_pos: bool = None, training: bool = False
......@@ -381,6 +394,14 @@ class TFSwinPatchEmbeddings(tf.keras.layers.Layer):
embeddings = tf.transpose(embeddings, (0, 2, 1))
return embeddings, output_dimensions
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
class TFSwinPatchMerging(tf.keras.layers.Layer):
"""
......@@ -443,6 +464,17 @@ class TFSwinPatchMerging(tf.keras.layers.Layer):
return input_feature
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "reduction", None) is not None:
with tf.name_scope(self.reduction.name):
self.reduction.build([None, None, 4 * self.dim])
if getattr(self, "norm", None) is not None:
with tf.name_scope(self.norm.name):
self.norm.build([None, None, 4 * self.dim])
class TFSwinDropPath(tf.keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
......@@ -521,7 +553,19 @@ class TFSwinSelfAttention(tf.keras.layers.Layer):
relative_coords = tf.stack([stack_0, stack_1], axis=2)
self.relative_position_index.assign(tf.cast(tf.reduce_sum(relative_coords, axis=-1), tf.int32))
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.all_head_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.all_head_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.all_head_size])
def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor:
new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size]
......@@ -597,12 +641,24 @@ class TFSwinSelfOutput(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(dim, name="dense")
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout")
self.dim = dim
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.dim])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
class TFSwinAttention(tf.keras.layers.Layer):
def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None:
......@@ -631,6 +687,17 @@ class TFSwinAttention(tf.keras.layers.Layer):
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "self_output", None) is not None:
with tf.name_scope(self.self_output.name):
self.self_output.build(None)
class TFSwinIntermediate(tf.keras.layers.Layer):
def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
......@@ -640,24 +707,43 @@ class TFSwinIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dim = dim
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.dim])
class TFSwinOutput(tf.keras.layers.Layer):
def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(dim, name="dense")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.config = config
self.dim = dim
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, int(self.config.mlp_ratio * self.dim)])
class TFSwinLayer(tf.keras.layers.Layer):
def __init__(
......@@ -684,6 +770,7 @@ class TFSwinLayer(tf.keras.layers.Layer):
)
self.intermediate = TFSwinIntermediate(config, dim, name="intermediate")
self.swin_output = TFSwinOutput(config, dim, name="output")
self.dim = dim
def get_attn_mask(self, height: int, width: int, window_size: int, shift_size: int) -> tf.Tensor | None:
img_mask = tf.zeros((height, width))
......@@ -789,6 +876,29 @@ class TFSwinLayer(tf.keras.layers.Layer):
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.dim])
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.dim])
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "swin_output", None) is not None:
with tf.name_scope(self.swin_output.name):
self.swin_output.build(None)
class TFSwinStage(tf.keras.layers.Layer):
def __init__(
......@@ -861,6 +971,18 @@ class TFSwinStage(tf.keras.layers.Layer):
stage_outputs += layer_outputs[1:]
return stage_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "downsample", None) is not None:
with tf.name_scope(self.downsample.name):
self.downsample.build(None)
if getattr(self, "blocks", None) is not None:
for layer in self.blocks:
with tf.name_scope(layer.name):
layer.build(None)
class TFSwinEncoder(tf.keras.layers.Layer):
def __init__(self, config: SwinConfig, grid_size: Tuple[int, int], **kwargs):
......@@ -941,6 +1063,15 @@ class TFSwinEncoder(tf.keras.layers.Layer):
reshaped_hidden_states=all_reshaped_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFSwinPreTrainedModel(TFPreTrainedModel):
"""
......@@ -1160,6 +1291,20 @@ class TFSwinMainLayer(tf.keras.layers.Layer):
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.num_features])
@add_start_docstrings(
"The bare Swin Model transformer outputting raw hidden-states without any specific head on top.",
......@@ -1217,6 +1362,14 @@ class TFSwinModel(TFSwinPreTrainedModel):
return swin_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
class TFSwinPixelShuffle(tf.keras.layers.Layer):
"""TF layer implementation of torch.nn.PixelShuffle"""
......@@ -1251,6 +1404,7 @@ class TFSwinDecoder(tf.keras.layers.Layer):
filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, strides=1, name="0"
)
self.pixel_shuffle = TFSwinPixelShuffle(config.encoder_stride, name="1")
self.config = config
def call(self, x: tf.Tensor) -> tf.Tensor:
hidden_states = x
......@@ -1262,6 +1416,17 @@ class TFSwinDecoder(tf.keras.layers.Layer):
hidden_states = tf.transpose(hidden_states, (0, 3, 1, 2))
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv2d", None) is not None:
with tf.name_scope(self.conv2d.name):
self.conv2d.build([None, None, None, self.config.hidden_size])
if getattr(self, "pixel_shuffle", None) is not None:
with tf.name_scope(self.pixel_shuffle.name):
self.pixel_shuffle.build(None)
@add_start_docstrings(
"Swin Model with a decoder on top for masked image modeling, as proposed in"
......@@ -1372,6 +1537,17 @@ class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"""
......@@ -1446,3 +1622,15 @@ class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificati
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.swin.num_features])
......@@ -45,7 +45,6 @@ from ...modeling_tf_utils import (
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ContextManagers,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
......@@ -75,16 +74,17 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFT5LayerNorm(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-6, **kwargs):
def __init__(self, hidden_size, epsilon=1e-6, **kwargs):
"""
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
"""
super().__init__(**kwargs)
self.variance_epsilon = epsilon
self.hidden_size = hidden_size
def build(self, input_shape):
"""Build shared word embedding layer"""
self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
self.weight = self.add_weight("weight", shape=(self.hidden_size,), initializer="ones")
super().build(input_shape)
def call(self, hidden_states):
......@@ -110,6 +110,7 @@ class TFT5DenseActDense(tf.keras.layers.Layer):
) # Update init weights as in flax
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
self.act = get_tf_activation(config.dense_act_fn)
self.config = config
def call(self, hidden_states, training=False):
hidden_states = self.wi(hidden_states)
......@@ -118,6 +119,17 @@ class TFT5DenseActDense(tf.keras.layers.Layer):
hidden_states = self.wo(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wi", None) is not None:
with tf.name_scope(self.wi.name):
self.wi.build([None, None, self.config.d_model])
if getattr(self, "wo", None) is not None:
with tf.name_scope(self.wo.name):
self.wo.build([None, None, self.config.d_ff])
class TFT5DenseGatedActDense(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -139,6 +151,7 @@ class TFT5DenseGatedActDense(tf.keras.layers.Layer):
) # Update init weights as in flax
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
self.act = get_tf_activation(config.dense_act_fn)
self.config = config
def call(self, hidden_states, training=False):
hidden_gelu = self.act(self.wi_0(hidden_states))
......@@ -148,6 +161,20 @@ class TFT5DenseGatedActDense(tf.keras.layers.Layer):
hidden_states = self.wo(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wi_0", None) is not None:
with tf.name_scope(self.wi_0.name):
self.wi_0.build([None, None, self.config.d_model])
if getattr(self, "wi_1", None) is not None:
with tf.name_scope(self.wi_1.name):
self.wi_1.build([None, None, self.config.d_model])
if getattr(self, "wo", None) is not None:
with tf.name_scope(self.wo.name):
self.wo.build([None, None, self.config.d_ff])
class TFT5LayerFF(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -157,7 +184,7 @@ class TFT5LayerFF(tf.keras.layers.Layer):
else:
self.DenseReluDense = TFT5DenseActDense(config, name="DenseReluDense")
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(self, hidden_states, training=False):
......@@ -166,6 +193,17 @@ class TFT5LayerFF(tf.keras.layers.Layer):
hidden_states = hidden_states + self.dropout(dense_output, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
if getattr(self, "DenseReluDense", None) is not None:
with tf.name_scope(self.DenseReluDense.name):
self.DenseReluDense.build(None)
class TFT5Attention(tf.keras.layers.Layer):
NEW_ID = itertools.count()
......@@ -218,7 +256,10 @@ class TFT5Attention(tf.keras.layers.Layer):
self.pruned_heads = set()
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.has_relative_attention_bias:
with tf.name_scope("relative_attention_bias"):
self.relative_attention_bias = self.add_weight(
......@@ -226,8 +267,18 @@ class TFT5Attention(tf.keras.layers.Layer):
shape=[self.relative_attention_num_buckets, self.n_heads],
initializer=self.relative_attention_bias_initializer, # Add initializer
)
return super().build(input_shape)
if getattr(self, "q", None) is not None:
with tf.name_scope(self.q.name):
self.q.build([None, None, self.d_model])
if getattr(self, "k", None) is not None:
with tf.name_scope(self.k.name):
self.k.build([None, None, self.d_model])
if getattr(self, "v", None) is not None:
with tf.name_scope(self.v.name):
self.v.build([None, None, self.d_model])
if getattr(self, "o", None) is not None:
with tf.name_scope(self.o.name):
self.o.build([None, None, self.inner_dim])
def prune_heads(self, heads):
raise NotImplementedError
......@@ -439,7 +490,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
has_relative_attention_bias=has_relative_attention_bias,
name="SelfAttention",
)
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(
......@@ -468,6 +519,17 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "SelfAttention", None) is not None:
with tf.name_scope(self.SelfAttention.name):
self.SelfAttention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -477,7 +539,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
has_relative_attention_bias=False,
name="EncDecAttention",
)
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.layer_norm = TFT5LayerNorm(config.d_model, epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(
......@@ -510,6 +572,17 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "EncDecAttention", None) is not None:
with tf.name_scope(self.EncDecAttention.name):
self.EncDecAttention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build(None)
class TFT5Block(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
......@@ -613,6 +686,15 @@ class TFT5Block(tf.keras.layers.Layer):
outputs = outputs + (present_key_value_state,) + attention_outputs
return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
def build(self, input_shape=None):
if self.built:
return
self.built = True
for layer_module in self.layer:
if hasattr(layer_module, "name"):
with tf.name_scope(layer_module.name):
layer_module.build(None)
####################################################
# The full model without a specific pretrained or finetuning head is
......@@ -640,7 +722,9 @@ class TFT5MainLayer(tf.keras.layers.Layer):
TFT5Block(config, has_relative_attention_bias=bool(i == 0), name=f"block_._{i}")
for i in range(config.num_layers)
]
self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
self.final_layer_norm = TFT5LayerNorm(
config.d_model, epsilon=config.layer_norm_epsilon, name="final_layer_norm"
)
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def _prune_heads(self, heads_to_prune):
......@@ -679,14 +763,6 @@ class TFT5MainLayer(tf.keras.layers.Layer):
if inputs_embeds is None:
assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids)
......@@ -846,6 +922,18 @@ class TFT5MainLayer(tf.keras.layers.Layer):
attentions=all_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build(None)
if getattr(self, "block", None) is not None:
for layer in self.block:
with tf.name_scope(layer.name):
layer.build(None)
####################################################
# TFT5PreTrainedModel is a sub-class of tf.keras.Model
@@ -1221,6 +1309,22 @@ class TFT5Model(TFT5PreTrainedModel):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
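The trailing "/" trick described in the comment above is standard tf.name_scope behaviour: a scope name ending in "/" is treated as absolute rather than nested under the current scope. A standalone snippet (the scope and variable names here are illustrative only) showing the difference:

import tensorflow as tf

with tf.name_scope("tf_t5model"):
    with tf.name_scope("encoder"):   # relative: nests under the current scope
        nested = tf.Variable(0.0, name="weight")
    with tf.name_scope("shared/"):   # absolute: the trailing "/" replaces the current scope
        rooted = tf.Variable(0.0, name="weight")

print(nested.name)  # expected: tf_t5model/encoder/weight:0
print(rooted.name)  # expected: shared/weight:0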
@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModelingLoss):
@@ -1250,6 +1354,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
self.lm_head = tf.keras.layers.Dense(
config.vocab_size, use_bias=False, name="lm_head", kernel_initializer=lm_head_initializer
) # Update init weights as in flax
self.config = config
def get_output_embeddings(self):
if self.config.tie_word_embeddings:
@@ -1471,6 +1576,25 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
return self._shift_right(labels)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.d_model])
@add_start_docstrings(
"The bare T5 Model transformer outputting encoder's raw hidden-stateswithout any specific head on top.",
@@ -1549,3 +1673,16 @@ class TFT5EncoderModel(TFT5PreTrainedModel):
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
self.shared.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
@@ -160,7 +160,7 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
@@ -186,7 +186,12 @@ class TFTapasEmbeddings(tf.keras.layers.Layer):
),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def call(
self,
@@ -279,6 +284,7 @@ class TFTapasSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -368,6 +374,20 @@ class TFTapasSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Tapas
class TFTapasSelfOutput(tf.keras.layers.Layer):
@@ -379,6 +399,7 @@ class TFTapasSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -387,6 +408,17 @@ class TFTapasSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Tapas
class TFTapasAttention(tf.keras.layers.Layer):
@@ -428,6 +460,17 @@ class TFTapasAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Tapas
class TFTapasIntermediate(tf.keras.layers.Layer):
@@ -442,6 +485,7 @@ class TFTapasIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -449,6 +493,14 @@ class TFTapasIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Tapas
class TFTapasOutput(tf.keras.layers.Layer):
@@ -460,6 +512,7 @@ class TFTapasOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -468,6 +521,17 @@ class TFTapasOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
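The shape lists passed to these build() calls only need an accurate last dimension; for a Dense layer that single value is enough to size the kernel. A quick standalone check (the sizes here are arbitrary, not from this diff):

import tensorflow as tf

dense = tf.keras.layers.Dense(units=4, name="dense")
dense.build([None, None, 8])   # only the last dimension is used to size the kernel
print(dense.kernel.shape)      # (8, 4)
print(dense.bias.shape)        # (4,)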
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Tapas
class TFTapasLayer(tf.keras.layers.Layer):
@@ -555,6 +619,23 @@ class TFTapasLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Tapas
class TFTapasEncoder(tf.keras.layers.Layer):
@@ -625,6 +706,15 @@ class TFTapasEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Tapas
class TFTapasPooler(tf.keras.layers.Layer):
@@ -637,6 +727,7 @@ class TFTapasPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
@@ -646,6 +737,14 @@ class TFTapasPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Tapas
class TFTapasPredictionHeadTransform(tf.keras.layers.Layer):
@@ -664,6 +763,7 @@ class TFTapasPredictionHeadTransform(tf.keras.layers.Layer):
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -672,6 +772,17 @@ class TFTapasPredictionHeadTransform(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Tapas
class TFTapasLMPredictionHead(tf.keras.layers.Layer):
@@ -687,10 +798,15 @@ class TFTapasLMPredictionHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "transform", None) is not None:
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
return self.input_embeddings
@@ -729,6 +845,14 @@ class TFTapasMLMHead(tf.keras.layers.Layer):
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@keras_serializable
class TFTapasMainLayer(tf.keras.layers.Layer):
@@ -852,6 +976,20 @@ class TFTapasMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFTapasPreTrainedModel(TFPreTrainedModel):
"""
@@ -1033,6 +1171,14 @@ class TFTapasModel(TFTapasPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "tapas", None) is not None:
with tf.name_scope(self.tapas.name):
self.tapas.build(None)
@add_start_docstrings("""Tapas Model with a `language modeling` head on top.""", TAPAS_START_DOCSTRING)
class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -1129,6 +1275,17 @@ class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "tapas", None) is not None:
with tf.name_scope(self.tapas.name):
self.tapas.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFTapasComputeTokenLogits(tf.keras.layers.Layer):
def __init__(self, config: TapasConfig, **kwargs):
@@ -1552,6 +1709,23 @@ class TFTapasForQuestionAnswering(TFTapasPreTrainedModel):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "tapas", None) is not None:
with tf.name_scope(self.tapas.name):
self.tapas.build(None)
if getattr(self, "compute_token_logits", None) is not None:
with tf.name_scope(self.compute_token_logits.name):
self.compute_token_logits.build(None)
if getattr(self, "compute_column_logits", None) is not None:
with tf.name_scope(self.compute_column_logits.name):
self.compute_column_logits.build(None)
if getattr(self, "aggregation_classifier", None) is not None:
with tf.name_scope(self.aggregation_classifier.name):
self.aggregation_classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1570,6 +1744,7 @@ class TFTapasForSequenceClassification(TFTapasPreTrainedModel, TFSequenceClassif
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1654,6 +1829,20 @@ class TFTapasForSequenceClassification(TFTapasPreTrainedModel, TFSequenceClassif
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "tapas", None) is not None:
with tf.name_scope(self.tapas.name):
self.tapas.build(None)
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
""" TAPAS utilities."""
@@ -684,3 +684,17 @@ class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLos
"Resizing the embedding layers via the TFVisionEncoderDecoderModel directly is not supported. "
"Please use the respective methods of the wrapped objects (model.decoder.resize_token_embeddings(...))"
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "enc_to_dec_proj", None) is not None:
with tf.name_scope(self.enc_to_dec_proj.name):
self.enc_to_dec_proj.build([None, None, self.encoder.config.hidden_size])
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@@ -220,12 +220,26 @@ class TFVisionTextDualEncoderModel(TFPreTrainedModel):
self.visual_projection = Dense(self.projection_dim, use_bias=False, name="visual_projection")
self.text_projection = Dense(self.projection_dim, use_bias=False, name="text_projection")
self.logit_scale = None
self.config = config
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build in the build() method to make sure the names are right
initializer = tf.keras.initializers.Constant(self.config.logit_scale_init_value)
self.logit_scale = self.add_weight(shape=(1,), initializer=initializer, name="logit_scale")
super().build(input_shape)
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection.name):
self.visual_projection.build([None, None, self.vision_embed_dim])
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection.name):
self.text_projection.build([None, None, self.text_embed_dim])
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
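On the "Build in the build() method" comment above: weights created during build() are created while Keras has the layer's name scope active, so they pick up the fully-qualified name, whereas weights created in __init__ do not. A rough sketch under that assumption (ScaleLayer and its weight names are made up, not part of this diff):

import tensorflow as tf

class ScaleLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Created outside any name scope: typically just "early:0".
        self.early = self.add_weight(shape=(1,), initializer="ones", name="early")

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Created while the layer's scope is active: typically "scale_layer/late:0".
        self.late = self.add_weight(shape=(1,), initializer="ones", name="late")

    def call(self, x):
        return x * self.early * self.late

layer = ScaleLayer(name="scale_layer")
_ = layer(tf.zeros((1,)))
print(layer.early.name, layer.late.name)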
def tf_to_pt_weight_rename(self, tf_weight):
# Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
@@ -66,7 +66,7 @@ class TFViTEmbeddings(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
num_patches = self.patch_embeddings.num_patches
self.cls_token = self.add_weight(
shape=(1, 1, self.config.hidden_size),
@@ -81,7 +81,12 @@ class TFViTEmbeddings(tf.keras.layers.Layer):
name="position_embeddings",
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor:
"""
@@ -205,6 +210,14 @@ class TFViTPatchEmbeddings(tf.keras.layers.Layer):
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
class TFViTSelfAttention(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -231,6 +244,7 @@ class TFViTSelfAttention(tf.keras.layers.Layer):
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -280,6 +294,20 @@ class TFViTSelfAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFViTSelfOutput(tf.keras.layers.Layer):
"""
@@ -294,6 +322,7 @@ class TFViTSelfOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -301,6 +330,14 @@ class TFViTSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFViTAttention(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -329,6 +366,17 @@ class TFViTAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFViTIntermediate(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -342,6 +390,7 @@ class TFViTIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -349,6 +398,14 @@ class TFViTIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFViTOutput(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -358,6 +415,7 @@ class TFViTOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -366,6 +424,14 @@ class TFViTOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
class TFViTLayer(tf.keras.layers.Layer):
"""This corresponds to the Block class in the timm implementation."""
@@ -383,6 +449,7 @@ class TFViTLayer(tf.keras.layers.Layer):
self.layernorm_after = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="layernorm_after"
)
self.config = config
def call(
self,
@@ -416,6 +483,26 @@ class TFViTLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "vit_output", None) is not None:
with tf.name_scope(self.vit_output.name):
self.vit_output.build(None)
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.config.hidden_size])
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.config.hidden_size])
class TFViTEncoder(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -461,6 +548,15 @@ class TFViTEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFViTMainLayer(tf.keras.layers.Layer):
@@ -539,6 +635,23 @@ class TFViTMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.hidden_size])
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFViTPreTrainedModel(TFPreTrainedModel):
"""
@@ -665,6 +778,14 @@ class TFViTModel(TFViTPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vit", None) is not None:
with tf.name_scope(self.vit.name):
self.vit.build(None)
class TFViTPooler(tf.keras.layers.Layer):
def __init__(self, config: ViTConfig, **kwargs):
@@ -676,6 +797,7 @@ class TFViTPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
@@ -685,6 +807,14 @@ class TFViTPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -714,6 +844,7 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
@@ -764,3 +895,14 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vit", None) is not None:
with tf.name_scope(self.vit.name):
self.vit.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])