"...lm-evaluation-harness.git" did not exist on "14043a0f762702bbb356073f50c066bcd9b275b2"
Unverified Commit 050e0b44 authored by Matt's avatar Matt Committed by GitHub
Browse files

Proper build() methods for TF (#27794)

* Add a convenience method for building in your own name scope

* Second attempt at auto layer building

* Revert "Second attempt at auto layer building"

This reverts commit e03a3aaecf9ec41a805582b83cbdfe3290a631be.

* Attempt #3

* Revert "Attempt #3"

This reverts commit b9df7a0857560d29b5abbed6127d9e9eca77cf47.

* Add missing attributes that we're going to need later

* Add some attributes we're going to need later

* A fourth attempt! Feel the power flow through you!

* Revert "A fourth attempt! Feel the power flow through you!"

This reverts commit 6bf4aaf3875d6f28485f50187617a4c616c8aff7.

* Add more values we'll need later

* TF refactor that we'll need later

* Revert "TF refactor that we'll need later"

This reverts commit ca07202fb5b7b7436b893baa8d688b4f348ea7b9.

* Revert "Revert "TF refactor that we'll need later""

This reverts commit 1beb0f39f293ed9c27594575e1c849aadeb15c13.

* make fixup

* Attempt five!

* Revert "Attempt five!"

This reverts commit 3302207958dfd0374b0447a51c06eea51a506044.

* Attempt six - this time don't add empty methods

* Revert "Attempt six - this time don't add empty methods"

This reverts commit 67d60129be75416b6beb8f47c7d38d77b18d79bb.

* Attempt seven - better base model class detection!

* Revert "Attempt seven - better base model class detection!"

This reverts commit 5f14845e92ea0e87c598da933bfbfee10f553bc9.

* Another attribute we'll need later

* Try again with the missing attribute!

* Revert "Try again with the missing attribute!"

This reverts commit 760c6f30c5dffb3e04b0e73c34a77d1882a0fef7.

* This is the attempt that will pierce the heavens!

* Revert "This is the attempt that will pierce the heavens!"

This reverts commit c868bb657de057aca7a5260350a3f831fc4dfee6.

* Attempt seven - snag list is steadily decreasing

* Revert "Attempt seven - snag list is steadily decreasing"

This reverts commit 46fbd975deda64429bfb3e5fac4fc0370c00d316.

* Attempt eight - will an empty snag list do it?

* Revert "Attempt eight - will an empty snag list do it?"

This reverts commit 7c8a3c2b083253649569e9877e02054ae5cec67b.

* Fixes to Hubert issues that cause problems later

* Trying again with Conv1D/SeparableConv fixes

* Revert "Trying again with Conv1D/SeparableConv fixes"

This reverts commit 55092bca952bc0f750aa1ffe246a640bf1e2036e.

* Apply the build shape fixes to Wav2Vec2 as well

* One more attempt!

* Revert "One more attempt!"

This reverts commit 5ac3e4cb01b9458cc93312873725f9444ae7261c.

* Another attempt!

* Revert "Another attempt!"

This reverts commit ea16d890e019d7de8792a3b8e72f3b1c02adae50.

* Let's see how many failures we get without the internal build method

* Fix OpenAI

* Fix MobileBERT

* (Mostly) fix GroupVIT

* Fix BLIP

* One more BLIP fix

* One more BLIP fix!

* Fix Regnet

* Finally fully fix GroupViT

* Fix Data2Vec and add the new AdaptivePool

* Fix Segformer

* Fix Albert

* Fix Deberta/DebertaV2

* Fix XLM

* Actually fix XLM

* Fix Flaubert

* Fix lxmert

* Fix Resnet

* Fix ConvBERT

* Fix ESM

* Fix Convnext / ConvnextV2

* Fix SAM

* Fix Efficientformer

* Fix LayoutLMv3

* Fix speech_to_text

* Fix mpnet and mobilevit

* Fix Swin

* Fix CTRL

* Fix CVT

* Fix DPR

* Fix Wav2Vec2

* Fix T5

* Fix Hubert

* Fix GPT2

* Fix Whisper

* Fix DeiT

* Fix the encoder-decoder / dual-encoder classes

* make fix-copies

* build in name scope

* Fix summarization test

* Fix tied weight names for BART + Blenderbot

* Fix tied weight name building

* Fix to TFESM weight building

* Update TF SAM

* Expand all the shapes out into Big Boy Shapes
parent 52c37882
......@@ -213,7 +213,7 @@ class TFViTMAEEmbeddings(tf.keras.layers.Layer):
self.config = config
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.cls_token = self.add_weight(
shape=(1, 1, self.config.hidden_size),
initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
......@@ -233,7 +233,12 @@ class TFViTMAEEmbeddings(tf.keras.layers.Layer):
)[None, ...]
self.position_embeddings.assign(pos_embed)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
def random_masking(self, sequence: tf.Tensor, noise: tf.Tensor | None = None):
"""
......@@ -352,6 +357,14 @@ class TFViTMAEPatchEmbeddings(tf.keras.layers.Layer):
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->ViTMAE
class TFViTMAESelfAttention(tf.keras.layers.Layer):
......@@ -379,6 +392,7 @@ class TFViTMAESelfAttention(tf.keras.layers.Layer):
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -428,6 +442,20 @@ class TFViTMAESelfAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->ViTMAE
class TFViTMAESelfOutput(tf.keras.layers.Layer):
......@@ -443,6 +471,7 @@ class TFViTMAESelfOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -450,6 +479,14 @@ class TFViTMAESelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->ViTMAE
class TFViTMAEAttention(tf.keras.layers.Layer):
......@@ -479,6 +516,17 @@ class TFViTMAEAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->ViTMAE
class TFViTMAEIntermediate(tf.keras.layers.Layer):
......@@ -493,6 +541,7 @@ class TFViTMAEIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -500,6 +549,14 @@ class TFViTMAEIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->ViTMAE
class TFViTMAEOutput(tf.keras.layers.Layer):
......@@ -510,6 +567,7 @@ class TFViTMAEOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -518,6 +576,14 @@ class TFViTMAEOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTLayer with ViT->ViTMAE
class TFViTMAELayer(tf.keras.layers.Layer):
......@@ -536,6 +602,7 @@ class TFViTMAELayer(tf.keras.layers.Layer):
self.layernorm_after = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="layernorm_after"
)
self.config = config
def call(
self,
......@@ -569,6 +636,26 @@ class TFViTMAELayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "vit_output", None) is not None:
with tf.name_scope(self.vit_output.name):
self.vit_output.build(None)
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.config.hidden_size])
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->ViTMAE
class TFViTMAEEncoder(tf.keras.layers.Layer):
......@@ -615,6 +702,15 @@ class TFViTMAEEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFViTMAEMainLayer(tf.keras.layers.Layer):
......@@ -687,6 +783,20 @@ class TFViTMAEMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.hidden_size])
class TFViTMAEPreTrainedModel(TFPreTrainedModel):
"""
......@@ -829,6 +939,14 @@ class TFViTMAEModel(TFViTMAEPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vit", None) is not None:
with tf.name_scope(self.vit.name):
self.vit.build(None)
class TFViTMAEDecoder(tf.keras.layers.Layer):
def __init__(self, config, num_patches, **kwargs):
......@@ -853,7 +971,7 @@ class TFViTMAEDecoder(tf.keras.layers.Layer):
self.config = config
self.num_patches = num_patches
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.mask_token = self.add_weight(
shape=(1, 1, self.config.decoder_hidden_size),
initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
......@@ -873,7 +991,22 @@ class TFViTMAEDecoder(tf.keras.layers.Layer):
)[None, ...]
self.decoder_pos_embed.assign(decoder_pos_embed)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "decoder_embed", None) is not None:
with tf.name_scope(self.decoder_embed.name):
self.decoder_embed.build([None, None, self.config.hidden_size])
if getattr(self, "decoder_norm", None) is not None:
with tf.name_scope(self.decoder_norm.name):
self.decoder_norm.build([None, None, self.config.decoder_hidden_size])
if getattr(self, "decoder_pred", None) is not None:
with tf.name_scope(self.decoder_pred.name):
self.decoder_pred.build([None, None, self.config.decoder_hidden_size])
if getattr(self, "decoder_layers", None) is not None:
for layer in self.decoder_layers:
with tf.name_scope(layer.name):
layer.build(None)
def call(
self,
......@@ -1128,3 +1261,14 @@ class TFViTMAEForPreTraining(TFViTMAEPreTrainedModel):
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vit", None) is not None:
with tf.name_scope(self.vit.name):
self.vit.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
......@@ -450,11 +450,6 @@ class TFWav2Vec2WeightNormConv1D(tf.keras.layers.Conv1D):
def build(self, input_shape):
if not self.built:
input_shape = input_shape.as_list()
# If a specific input shape is passed in, we need to modify it to account for padding
# Not necessary if those portions of the shape are None
if input_shape[-2] is not None:
input_shape[-2] += self.explicit_padding * 2
super().build(input_shape)
self.kernel = tf.Variable(tf.transpose(self.kernel), name="weight_v", trainable=True)
......@@ -502,6 +497,14 @@ class TFWav2Vec2NoLayerNormConvLayer(tf.keras.layers.Layer):
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
class TFWav2Vec2LayerNormConvLayer(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None:
......@@ -525,6 +528,17 @@ class TFWav2Vec2LayerNormConvLayer(tf.keras.layers.Layer):
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.out_conv_dim])
class TFWav2Vec2GroupNormConvLayer(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None:
......@@ -550,6 +564,17 @@ class TFWav2Vec2GroupNormConvLayer(tf.keras.layers.Layer):
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.out_conv_dim])
class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None:
......@@ -563,6 +588,7 @@ class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer):
)
self.padding = TFWav2Vec2SamePadLayer(config.num_conv_pos_embeddings)
self.activation = get_tf_activation(config.feat_extract_activation)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.conv(hidden_states)
......@@ -570,6 +596,14 @@ class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer):
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.config.hidden_size])
class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer):
def __init__(self, num_conv_pos_embeddings, **kwargs):
......@@ -608,6 +642,15 @@ class TFWav2Vec2FeatureEncoder(tf.keras.layers.Layer):
hidden_states = conv_layer(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv_layers", None) is not None:
for conv_layer in self.conv_layers:
with tf.name_scope(conv_layer.name):
conv_layer.build(None)
class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder):
def __init__(self, config, **kwargs):
......@@ -632,6 +675,7 @@ class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer):
name="projection",
)
self.dropout = tf.keras.layers.Dropout(rate=config.feat_proj_dropout)
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
norm_hidden_states = self.layer_norm(hidden_states)
......@@ -639,6 +683,17 @@ class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer):
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states, norm_hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.conv_dim[-1]])
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, self.config.conv_dim[-1]])
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFWav2Vec2
class TFWav2Vec2Attention(tf.keras.layers.Layer):
......@@ -793,6 +848,23 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFWav2Vec2FeedForward(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
......@@ -815,6 +887,7 @@ class TFWav2Vec2FeedForward(tf.keras.layers.Layer):
name="output_dense",
)
self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout)
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.intermediate_dense(hidden_states)
......@@ -825,6 +898,17 @@ class TFWav2Vec2FeedForward(tf.keras.layers.Layer):
hidden_states = self.output_dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "intermediate_dense", None) is not None:
with tf.name_scope(self.intermediate_dense.name):
self.intermediate_dense.build([None, None, self.config.hidden_size])
if getattr(self, "output_dense", None) is not None:
with tf.name_scope(self.output_dense.name):
self.output_dense.build([None, None, self.config.intermediate_size])
class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
......@@ -842,6 +926,7 @@ class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer):
self.final_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="final_layer_norm"
)
self.config = config
def call(
self,
......@@ -868,6 +953,23 @@ class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "feed_forward", None) is not None:
with tf.name_scope(self.feed_forward.name):
self.feed_forward.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
......@@ -885,6 +987,7 @@ class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer):
self.final_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="final_layer_norm"
)
self.config = config
def call(
self,
......@@ -909,6 +1012,23 @@ class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "feed_forward", None) is not None:
with tf.name_scope(self.feed_forward.name):
self.feed_forward.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
class TFWav2Vec2Encoder(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
......@@ -974,6 +1094,21 @@ class TFWav2Vec2Encoder(tf.keras.layers.Layer):
attentions=all_self_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "pos_conv_embed", None) is not None:
with tf.name_scope(self.pos_conv_embed.name):
self.pos_conv_embed.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFWav2Vec2EncoderStableLayerNorm(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
......@@ -1041,6 +1176,21 @@ class TFWav2Vec2EncoderStableLayerNorm(tf.keras.layers.Layer):
attentions=all_self_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "pos_conv_embed", None) is not None:
with tf.name_scope(self.pos_conv_embed.name):
self.pos_conv_embed.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
......@@ -1057,12 +1207,23 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
else:
self.encoder = TFWav2Vec2Encoder(config, name="encoder")
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.masked_spec_embed = self.add_weight(
shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "feature_extractor", None) is not None:
with tf.name_scope(self.feature_extractor.name):
self.feature_extractor.build(None)
if getattr(self, "feature_projection", None) is not None:
with tf.name_scope(self.feature_projection.name):
self.feature_projection.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
"""
......@@ -1419,6 +1580,14 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wav2vec2", None) is not None:
with tf.name_scope(self.wav2vec2.name):
self.wav2vec2.build(None)
@add_start_docstrings(
"""TFWav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
......@@ -1431,6 +1600,9 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
self.wav2vec2 = TFWav2Vec2MainLayer(config, name="wav2vec2")
self.dropout = tf.keras.layers.Dropout(config.final_dropout)
self.lm_head = tf.keras.layers.Dense(config.vocab_size, name="lm_head")
self.output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
def freeze_feature_extractor(self):
"""
......@@ -1572,6 +1744,17 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wav2vec2", None) is not None:
with tf.name_scope(self.wav2vec2.name):
self.wav2vec2.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.output_hidden_size])
class TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel):
def __init__(self, config):
......@@ -1669,3 +1852,17 @@ class TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel):
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wav2vec2", None) is not None:
with tf.name_scope(self.wav2vec2.name):
self.wav2vec2.build(None)
if getattr(self, "projector", None) is not None:
with tf.name_scope(self.projector.name):
self.projector.build([None, None, self.config.hidden_size])
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.classifier_proj_size])
......@@ -313,6 +313,23 @@ class TFWhisperAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextEncoderLayer with Speech2Text->Whisper
class TFWhisperEncoderLayer(tf.keras.layers.Layer):
......@@ -329,6 +346,7 @@ class TFWhisperEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False
......@@ -369,6 +387,26 @@ class TFWhisperEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextDecoderLayer with Speech2Text->Whisper
class TFWhisperDecoderLayer(tf.keras.layers.Layer):
......@@ -399,6 +437,7 @@ class TFWhisperDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
......@@ -482,6 +521,32 @@ class TFWhisperDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFWhisperPreTrainedModel(TFPreTrainedModel):
config_class = WhisperConfig
......@@ -749,6 +814,27 @@ class TFWhisperEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build([None, None, self.num_mel_bins])
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build([None, None, self.embed_dim])
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "encoder_layers", None) is not None:
for layer in self.encoder_layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFWhisperDecoder(tf.keras.layers.Layer):
......@@ -988,6 +1074,24 @@ class TFWhisperDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "decoder_layers", None) is not None:
for layer in self.decoder_layers:
with tf.name_scope(layer.name):
layer.build(None)
@add_start_docstrings(
"The bare Whisper Model outputting raw hidden-states without any specific head on top.",
......@@ -1111,6 +1215,17 @@ class TFWhisperMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare Whisper Model outputting raw hidden-states without any specific head on top.",
......@@ -1219,6 +1334,14 @@ class TFWhisperModel(TFWhisperPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"The Whisper Model with a language modeling head. Can be used for automatic speech recognition.",
......@@ -1630,3 +1753,11 @@ class TFWhisperForConditionalGeneration(TFWhisperPreTrainedModel, TFCausalLangua
"decoder_attention_mask": decoder_attention_mask,
"decoder_position_ids": decoder_position_ids,
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
......@@ -301,6 +301,23 @@ class TFXGLMAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFXGLMDecoderLayer(tf.keras.layers.Layer):
def __init__(self, config: XGLMConfig, **kwargs: Any) -> None:
......@@ -333,6 +350,7 @@ class TFXGLMDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer.call
def call(
......@@ -415,6 +433,32 @@ class TFXGLMDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
@keras_serializable
class TFXGLMMainLayer(tf.keras.layers.Layer):
......@@ -609,6 +653,21 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFXGLMPreTrainedModel(TFPreTrainedModel):
config_class = XGLMConfig
......@@ -792,6 +851,14 @@ class TFXGLMModel(TFXGLMPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"""
......@@ -822,6 +889,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
kernel_initializer=get_initializer(config.init_std),
name="lm_head",
)
self.config = config
def get_output_embeddings(self):
return self.lm_head
......@@ -925,6 +993,17 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.hidden_size])
def tf_to_pt_weight_rename(self, tf_weight):
if tf_weight == "lm_head.weight":
return tf_weight, "model.embed_tokens.weight"
......
......@@ -132,6 +132,7 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer):
self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
self.pruned_heads = set()
self.dim = dim
def prune_heads(self, heads):
raise NotImplementedError
......@@ -206,6 +207,23 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.dim])
class TFXLMTransformerFFN(tf.keras.layers.Layer):
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
......@@ -215,6 +233,8 @@ class TFXLMTransformerFFN(tf.keras.layers.Layer):
self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.in_dim = in_dim
self.dim_hidden = dim_hidden
def call(self, input, training=False):
x = self.lin1(input)
......@@ -224,6 +244,17 @@ class TFXLMTransformerFFN(tf.keras.layers.Layer):
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.in_dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.dim_hidden])
@keras_serializable
class TFXLMMainLayer(tf.keras.layers.Layer):
......@@ -316,7 +347,10 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
if self.attentions[int(layer)].n_heads == config.n_heads:
self.prune_heads({int(layer): list(map(int, heads))})
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
......@@ -331,8 +365,24 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
shape=[self.n_langs, self.dim],
initializer=get_initializer(self.embed_init_std),
)
super().build(input_shape)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "layer_norm_emb", None) is not None:
with tf.name_scope(self.layer_norm_emb.name):
self.layer_norm_emb.build([None, None, self.dim])
for layer in self.attentions:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm1:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
for layer in self.ffns:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm2:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
def get_input_embeddings(self):
return self.embeddings
......@@ -734,6 +784,14 @@ class TFXLMModel(TFXLMPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
class TFXLMPredLayer(tf.keras.layers.Layer):
"""
......@@ -871,6 +929,17 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "pred_layer", None) is not None:
with tf.name_scope(self.pred_layer.name):
self.pred_layer.build(None)
@add_start_docstrings(
"""
......@@ -949,6 +1018,17 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
@add_start_docstrings(
"""
......@@ -966,6 +1046,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
self.logits_proj = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@property
def dummy_inputs(self):
......@@ -1068,6 +1149,20 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.num_labels])
@add_start_docstrings(
"""
......@@ -1086,6 +1181,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1148,6 +1244,17 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1163,6 +1270,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1238,3 +1346,14 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -178,7 +178,7 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
......@@ -200,7 +200,12 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
......@@ -273,6 +278,7 @@ class TFXLMRobertaPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
......@@ -282,6 +288,14 @@ class TFXLMRobertaPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->XLMRoberta
class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):
......@@ -311,6 +325,7 @@ class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
......@@ -400,6 +415,20 @@ class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->XLMRoberta
class TFXLMRobertaSelfOutput(tf.keras.layers.Layer):
......@@ -411,6 +440,7 @@ class TFXLMRobertaSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -419,6 +449,17 @@ class TFXLMRobertaSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->XLMRoberta
class TFXLMRobertaAttention(tf.keras.layers.Layer):
......@@ -460,6 +501,17 @@ class TFXLMRobertaAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->XLMRoberta
class TFXLMRobertaIntermediate(tf.keras.layers.Layer):
......@@ -474,6 +526,7 @@ class TFXLMRobertaIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -481,6 +534,14 @@ class TFXLMRobertaIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->XLMRoberta
class TFXLMRobertaOutput(tf.keras.layers.Layer):
......@@ -492,6 +553,7 @@ class TFXLMRobertaOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
......@@ -500,6 +562,17 @@ class TFXLMRobertaOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->XLMRoberta
class TFXLMRobertaLayer(tf.keras.layers.Layer):
......@@ -587,6 +660,23 @@ class TFXLMRobertaLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->XLMRoberta
class TFXLMRobertaEncoder(tf.keras.layers.Layer):
......@@ -657,6 +747,15 @@ class TFXLMRobertaEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->XLMRoberta
......@@ -855,6 +954,20 @@ class TFXLMRobertaMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->XLMRoberta
class TFXLMRobertaPreTrainedModel(TFPreTrainedModel):
......@@ -940,6 +1053,14 @@ class TFXLMRobertaModel(TFXLMRobertaPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->XLMRoberta
class TFXLMRobertaLMHead(tf.keras.layers.Layer):
......@@ -960,10 +1081,18 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
......@@ -1072,6 +1201,17 @@ class TFXLMRobertaForMaskedLM(TFXLMRobertaPreTrainedModel, TFMaskedLanguageModel
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
@add_start_docstrings(
"XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.",
......@@ -1199,6 +1339,17 @@ class TFXLMRobertaForCausalLM(TFXLMRobertaPreTrainedModel, TFCausalLanguageModel
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->XLMRoberta
class TFXLMRobertaClassificationHead(tf.keras.layers.Layer):
......@@ -1219,6 +1370,7 @@ class TFXLMRobertaClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
......@@ -1228,6 +1380,17 @@ class TFXLMRobertaClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1305,6 +1468,17 @@ class TFXLMRobertaForSequenceClassification(TFXLMRobertaPreTrainedModel, TFSeque
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
......@@ -1327,6 +1501,7 @@ class TFXLMRobertaForMultipleChoice(TFXLMRobertaPreTrainedModel, TFMultipleChoic
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
......@@ -1398,6 +1573,17 @@ class TFXLMRobertaForMultipleChoice(TFXLMRobertaPreTrainedModel, TFMultipleChoic
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1424,6 +1610,7 @@ class TFXLMRobertaForTokenClassification(TFXLMRobertaPreTrainedModel, TFTokenCla
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1482,6 +1669,17 @@ class TFXLMRobertaForTokenClassification(TFXLMRobertaPreTrainedModel, TFTokenCla
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1503,6 +1701,7 @@ class TFXLMRobertaForQuestionAnswering(TFXLMRobertaPreTrainedModel, TFQuestionAn
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1574,3 +1773,14 @@ class TFXLMRobertaForQuestionAnswering(TFXLMRobertaPreTrainedModel, TFQuestionAn
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -85,8 +85,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.config = config
def build(self, input_shape):
def build(self, input_shape=None):
initializer = get_initializer(self.initializer_range)
self.q = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q"
......@@ -115,7 +116,13 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self.seg_embed = self.add_weight(
shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
def prune_heads(self, heads):
raise NotImplementedError
......@@ -344,6 +351,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
self.activation_function = get_tf_activation(config.ff_activation)
else:
self.activation_function = config.ff_activation
self.config = config
def call(self, inp, training=False):
output = inp
......@@ -355,6 +363,20 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
output = self.layer_norm(output + inp)
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layer_1", None) is not None:
with tf.name_scope(self.layer_1.name):
self.layer_1.build([None, None, self.config.d_model])
if getattr(self, "layer_2", None) is not None:
with tf.name_scope(self.layer_2.name):
self.layer_2.build([None, None, self.config.d_inner])
class TFXLNetLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
......@@ -399,6 +421,17 @@ class TFXLNetLayer(tf.keras.layers.Layer):
outputs = (output_h, output_g) + outputs[2:] # Add again attentions if there are there
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rel_attn", None) is not None:
with tf.name_scope(self.rel_attn.name):
self.rel_attn.build(None)
if getattr(self, "ff", None) is not None:
with tf.name_scope(self.ff.name):
self.ff.build(None)
class TFXLNetLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
......@@ -471,12 +504,22 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
self.word_embedding.weight = value
self.word_embedding.vocab_size = shape_list(value)[0]
def build(self, input_shape):
def build(self, input_shape=None):
initializer = get_initializer(self.initializer_range)
self.mask_emb = self.add_weight(
shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "word_embedding", None) is not None:
with tf.name_scope(self.word_embedding.name):
self.word_embedding.build(None)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
......@@ -1177,6 +1220,14 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@add_start_docstrings(
"""
......@@ -1336,6 +1387,17 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "lm_loss", None) is not None:
with tf.name_scope(self.lm_loss.name):
self.lm_loss.build(None)
@add_start_docstrings(
"""
......@@ -1356,6 +1418,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
self.logits_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1423,6 +1486,20 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.d_model])
@add_start_docstrings(
"""
......@@ -1442,6 +1519,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
self.logits_proj = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
......@@ -1524,6 +1602,20 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.d_model])
@add_start_docstrings(
"""
......@@ -1541,6 +1633,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1604,6 +1697,17 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
......@@ -1619,6 +1723,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
......@@ -1697,3 +1802,14 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
......@@ -2161,16 +2161,8 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
......@@ -2359,16 +2351,8 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer):
positions = self.embed_positions(input_shape, past_key_values_length)
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope`
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids)
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids)
hidden_states = inputs_embeds
......@@ -2578,6 +2562,13 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + '/' + self.shared.name + '/'):
self.shared.build(None)
@add_start_docstrings(
"The bare {{cookiecutter.uppercase_modelname}} Model outputting raw hidden-states without any specific head on top.",
......
......@@ -1071,9 +1071,9 @@ class TFEncoderDecoderModelSaveLoadTests(unittest.TestCase):
# create two random BERT models for bert2bert & initialize weights (+cross_attention weights)
encoder = TFBertModel(config.encoder)
encoder.build()
encoder.build_in_name_scope()
decoder = TFBertLMHeadModel(config.decoder)
decoder.build()
decoder.build_in_name_scope()
encoder_decoder_orig = TFEncoderDecoderModel(encoder=encoder, decoder=decoder)
......
......@@ -180,7 +180,7 @@ class TFOPTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
else:
# Here we build the word embeddings weights if not exists.
# And then we retry to get the attribute once built.
model.build()
model.build_in_name_scope()
if hasattr(embedding_layer, "weight"):
return embedding_layer.weight
else:
......
......@@ -729,9 +729,9 @@ class TFVisionEncoderDecoderModelSaveLoadTests(unittest.TestCase):
# create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights)
encoder = TFViTModel(config.encoder)
encoder.build()
encoder.build_in_name_scope()
decoder = TFGPT2LMHeadModel(config.decoder)
decoder.build()
decoder.build_in_name_scope()
encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
......
......@@ -290,7 +290,7 @@ class TFWhisperModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
for model_class in self.all_model_classes:
model = model_class(config)
model.build()
model.build_in_name_scope()
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname, saved_model=False)
......
......@@ -21,7 +21,7 @@ from transformers import (
TFPreTrainedModel,
pipeline,
)
from transformers.testing_utils import get_gpu_count, is_pipeline_test, require_tf, require_torch, slow, torch_device
from transformers.testing_utils import is_pipeline_test, require_tf, require_torch, slow, torch_device
from transformers.tokenization_utils import TruncationStrategy
from .test_pipelines_common import ANY
......@@ -67,8 +67,8 @@ class SummarizationPipelineTests(unittest.TestCase):
# the embedding layer.
if not (
isinstance(model, TFPreTrainedModel)
and get_gpu_count() > 0
and len(summarizer.model.trainable_weights) > 0
and "GPU" in summarizer.model.trainable_weights[0].device
):
with self.assertRaises(Exception):
outputs = summarizer("This " * 1000)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment