Unverified Commit 050e0b44 authored by Matt and committed by GitHub

Proper build() methods for TF (#27794)

* Add a convenience method for building in your own name scope

* Second attempt at auto layer building

* Revert "Second attempt at auto layer building"

This reverts commit e03a3aaecf9ec41a805582b83cbdfe3290a631be.

* Attempt #3

* Revert "Attempt #3"

This reverts commit b9df7a0857560d29b5abbed6127d9e9eca77cf47.

* Add missing attributes that we're going to need later

* Add some attributes we're going to need later

* A fourth attempt! Feel the power flow through you!

* Revert "A fourth attempt! Feel the power flow through you!"

This reverts commit 6bf4aaf3875d6f28485f50187617a4c616c8aff7.

* Add more values we'll need later

* TF refactor that we'll need later

* Revert "TF refactor that we'll need later"

This reverts commit ca07202fb5b7b7436b893baa8d688b4f348ea7b9.

* Revert "Revert "TF refactor that we'll need later""

This reverts commit 1beb0f39f293ed9c27594575e1c849aadeb15c13.

* make fixup

* Attempt five!

* Revert "Attempt five!"

This reverts commit 3302207958dfd0374b0447a51c06eea51a506044.

* Attempt six - this time don't add empty methods

* Revert "Attempt six - this time don't add empty methods"

This reverts commit 67d60129be75416b6beb8f47c7d38d77b18d79bb.

* Attempt seven - better base model class detection!

* Revert "Attempt seven - better base model class detection!"

This reverts commit 5f14845e92ea0e87c598da933bfbfee10f553bc9.

* Another attribute we'll need later

* Try again with the missing attribute!

* Revert "Try again with the missing attribute!"

This reverts commit 760c6f30c5dffb3e04b0e73c34a77d1882a0fef7.

* This is the attempt that will pierce the heavens!

* Revert "This is the attempt that will pierce the heavens!"

This reverts commit c868bb657de057aca7a5260350a3f831fc4dfee6.

* Attempt seven - snag list is steadily decreasing

* Revert "Attempt seven - snag list is steadily decreasing"

This reverts commit 46fbd975deda64429bfb3e5fac4fc0370c00d316.

* Attempt eight - will an empty snag list do it?

* Revert "Attempt eight - will an empty snag list do it?"

This reverts commit 7c8a3c2b083253649569e9877e02054ae5cec67b.

* Fixes to Hubert issues that cause problems later

* Trying again with Conv1D/SeparableConv fixes

* Revert "Trying again with Conv1D/SeparableConv fixes"

This reverts commit 55092bca952bc0f750aa1ffe246a640bf1e2036e.

* Apply the build shape fixes to Wav2Vec2 as well

* One more attempt!

* Revert "One more attempt!"

This reverts commit 5ac3e4cb01b9458cc93312873725f9444ae7261c.

* Another attempt!

* Revert "Another attempt!"

This reverts commit ea16d890e019d7de8792a3b8e72f3b1c02adae50.

* Let's see how many failures we get without the internal build method

* Fix OpenAI

* Fix MobileBERT

* (Mostly) fix GroupVIT

* Fix BLIP

* One more BLIP fix

* One more BLIP fix!

* Fix Regnet

* Finally fully fix GroupViT

* Fix Data2Vec and add the new AdaptivePool

* Fix Segformer

* Fix Albert

* Fix Deberta/DebertaV2

* Fix XLM

* Actually fix XLM

* Fix Flaubert

* Fix lxmert

* Fix Resnet

* Fix ConvBERT

* Fix ESM

* Fix Convnext / ConvnextV2

* Fix SAM

* Fix Efficientformer

* Fix LayoutLMv3

* Fix speech_to_text

* Fix mpnet and mobilevit

* Fix Swin

* Fix CTRL

* Fix CVT

* Fix DPR

* Fix Wav2Vec2

* Fix T5

* Fix Hubert

* Fix GPT2

* Fix Whisper

* Fix DeiT

* Fix the encoder-decoder / dual-encoder classes

* make fix-copies

* build in name scope

* Fix summarization test

* Fix tied weight names for BART + Blenderbot

* Fix tied weight name building

* Fix to TFESM weight building

* Update TF SAM

* Expand all the shapes out into Big Boy Shapes
parent 52c37882
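The diff below applies one consistent pattern across the ViTMAE, Wav2Vec2, Whisper, XGLM and XLM TensorFlow models: each layer keeps whatever it needs from its config in __init__ (hence the new self.config = config, self.dim = dim and similar attributes), and gains an explicit, idempotent build() that creates its own weights and then builds every sublayer inside that sublayer's tf.name_scope with a fully spelled-out input shape (None for dynamic axes, a config value for the feature axis), so weights can be created without a dummy forward pass. As a rough sketch only (TFExampleLayer and its config are invented for illustration and are not part of this commit), the idiom looks like this:

import tensorflow as tf
from types import SimpleNamespace

class TFExampleLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
        # Keep the config around so build() can spell out input shapes later.
        self.config = config

    def build(self, input_shape=None):
        if self.built:  # build() may run more than once; make repeat calls a no-op
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            # Build the sublayer inside its own name scope so its weights get the
            # expected names (".../dense/kernel:0"), passing an explicit
            # [batch, sequence, features] shape instead of relying on real inputs.
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])

# Weights are created eagerly, with no data run through the layer:
layer = TFExampleLayer(SimpleNamespace(hidden_size=8), name="example")
layer.build(None)

Calling build(None) on such a layer creates all weights up front; calling it again is a no-op thanks to the self.built guard.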
@@ -213,7 +213,7 @@ class TFViTMAEEmbeddings(tf.keras.layers.Layer):
self.config = config
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.cls_token = self.add_weight(
shape=(1, 1, self.config.hidden_size),
initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
@@ -233,7 +233,12 @@ class TFViTMAEEmbeddings(tf.keras.layers.Layer):
)[None, ...]
self.position_embeddings.assign(pos_embed)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
def random_masking(self, sequence: tf.Tensor, noise: tf.Tensor | None = None):
"""
@@ -352,6 +357,14 @@ class TFViTMAEPatchEmbeddings(tf.keras.layers.Layer):
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->ViTMAE
class TFViTMAESelfAttention(tf.keras.layers.Layer):
@@ -379,6 +392,7 @@ class TFViTMAESelfAttention(tf.keras.layers.Layer):
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -428,6 +442,20 @@ class TFViTMAESelfAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->ViTMAE
class TFViTMAESelfOutput(tf.keras.layers.Layer):
@@ -443,6 +471,7 @@ class TFViTMAESelfOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -450,6 +479,14 @@ class TFViTMAESelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->ViTMAE
class TFViTMAEAttention(tf.keras.layers.Layer):
@@ -479,6 +516,17 @@ class TFViTMAEAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->ViTMAE
class TFViTMAEIntermediate(tf.keras.layers.Layer):
@@ -493,6 +541,7 @@ class TFViTMAEIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -500,6 +549,14 @@ class TFViTMAEIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->ViTMAE
class TFViTMAEOutput(tf.keras.layers.Layer):
@@ -510,6 +567,7 @@ class TFViTMAEOutput(tf.keras.layers.Layer):
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -518,6 +576,14 @@ class TFViTMAEOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTLayer with ViT->ViTMAE
class TFViTMAELayer(tf.keras.layers.Layer):
@@ -536,6 +602,7 @@ class TFViTMAELayer(tf.keras.layers.Layer):
self.layernorm_after = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="layernorm_after"
)
self.config = config
def call(
self,
@@ -569,6 +636,26 @@ class TFViTMAELayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "vit_output", None) is not None:
with tf.name_scope(self.vit_output.name):
self.vit_output.build(None)
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.config.hidden_size])
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->ViTMAE
class TFViTMAEEncoder(tf.keras.layers.Layer):
@@ -615,6 +702,15 @@ class TFViTMAEEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFViTMAEMainLayer(tf.keras.layers.Layer):
@@ -687,6 +783,20 @@ class TFViTMAEMainLayer(tf.keras.layers.Layer):
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.hidden_size])
class TFViTMAEPreTrainedModel(TFPreTrainedModel):
"""
@@ -829,6 +939,14 @@ class TFViTMAEModel(TFViTMAEPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vit", None) is not None:
with tf.name_scope(self.vit.name):
self.vit.build(None)
class TFViTMAEDecoder(tf.keras.layers.Layer):
def __init__(self, config, num_patches, **kwargs):
@@ -853,7 +971,7 @@ class TFViTMAEDecoder(tf.keras.layers.Layer):
self.config = config
self.num_patches = num_patches
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.mask_token = self.add_weight(
shape=(1, 1, self.config.decoder_hidden_size),
initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
@@ -873,7 +991,22 @@ class TFViTMAEDecoder(tf.keras.layers.Layer):
)[None, ...]
self.decoder_pos_embed.assign(decoder_pos_embed)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "decoder_embed", None) is not None:
with tf.name_scope(self.decoder_embed.name):
self.decoder_embed.build([None, None, self.config.hidden_size])
if getattr(self, "decoder_norm", None) is not None:
with tf.name_scope(self.decoder_norm.name):
self.decoder_norm.build([None, None, self.config.decoder_hidden_size])
if getattr(self, "decoder_pred", None) is not None:
with tf.name_scope(self.decoder_pred.name):
self.decoder_pred.build([None, None, self.config.decoder_hidden_size])
if getattr(self, "decoder_layers", None) is not None:
for layer in self.decoder_layers:
with tf.name_scope(layer.name):
layer.build(None)
def call(
self,
@@ -1128,3 +1261,14 @@ class TFViTMAEForPreTraining(TFViTMAEPreTrainedModel):
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "vit", None) is not None:
with tf.name_scope(self.vit.name):
self.vit.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@@ -450,11 +450,6 @@ class TFWav2Vec2WeightNormConv1D(tf.keras.layers.Conv1D):
def build(self, input_shape):
if not self.built:
input_shape = input_shape.as_list()
# If a specific input shape is passed in, we need to modify it to account for padding
# Not necessary if those portions of the shape are None
if input_shape[-2] is not None:
input_shape[-2] += self.explicit_padding * 2
super().build(input_shape)
self.kernel = tf.Variable(tf.transpose(self.kernel), name="weight_v", trainable=True)
@@ -502,6 +497,14 @@ class TFWav2Vec2NoLayerNormConvLayer(tf.keras.layers.Layer):
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
class TFWav2Vec2LayerNormConvLayer(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None:
@@ -525,6 +528,17 @@ class TFWav2Vec2LayerNormConvLayer(tf.keras.layers.Layer):
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.out_conv_dim])
class TFWav2Vec2GroupNormConvLayer(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None:
@@ -550,6 +564,17 @@ class TFWav2Vec2GroupNormConvLayer(tf.keras.layers.Layer):
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.in_conv_dim])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.out_conv_dim])
class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None:
@@ -563,6 +588,7 @@ class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer):
)
self.padding = TFWav2Vec2SamePadLayer(config.num_conv_pos_embeddings)
self.activation = get_tf_activation(config.feat_extract_activation)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.conv(hidden_states)
@@ -570,6 +596,14 @@ class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer):
hidden_states = self.activation(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, self.config.hidden_size])
class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer):
def __init__(self, num_conv_pos_embeddings, **kwargs):
@@ -608,6 +642,15 @@ class TFWav2Vec2FeatureEncoder(tf.keras.layers.Layer):
hidden_states = conv_layer(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv_layers", None) is not None:
for conv_layer in self.conv_layers:
with tf.name_scope(conv_layer.name):
conv_layer.build(None)
class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder):
def __init__(self, config, **kwargs):
@@ -632,6 +675,7 @@ class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer):
name="projection",
)
self.dropout = tf.keras.layers.Dropout(rate=config.feat_proj_dropout)
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
norm_hidden_states = self.layer_norm(hidden_states)
@@ -639,6 +683,17 @@ class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer):
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states, norm_hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.conv_dim[-1]])
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, self.config.conv_dim[-1]])
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFWav2Vec2
class TFWav2Vec2Attention(tf.keras.layers.Layer):
@@ -793,6 +848,23 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFWav2Vec2FeedForward(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
@@ -815,6 +887,7 @@ class TFWav2Vec2FeedForward(tf.keras.layers.Layer):
name="output_dense",
)
self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout)
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.intermediate_dense(hidden_states)
@@ -825,6 +898,17 @@ class TFWav2Vec2FeedForward(tf.keras.layers.Layer):
hidden_states = self.output_dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "intermediate_dense", None) is not None:
with tf.name_scope(self.intermediate_dense.name):
self.intermediate_dense.build([None, None, self.config.hidden_size])
if getattr(self, "output_dense", None) is not None:
with tf.name_scope(self.output_dense.name):
self.output_dense.build([None, None, self.config.intermediate_size])
class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
@@ -842,6 +926,7 @@ class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer):
self.final_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="final_layer_norm"
)
self.config = config
def call(
self,
@@ -868,6 +953,23 @@ class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "feed_forward", None) is not None:
with tf.name_scope(self.feed_forward.name):
self.feed_forward.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
@@ -885,6 +987,7 @@ class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer):
self.final_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="final_layer_norm"
)
self.config = config
def call(
self,
@@ -909,6 +1012,23 @@ class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "feed_forward", None) is not None:
with tf.name_scope(self.feed_forward.name):
self.feed_forward.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
class TFWav2Vec2Encoder(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
@@ -974,6 +1094,21 @@ class TFWav2Vec2Encoder(tf.keras.layers.Layer):
attentions=all_self_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "pos_conv_embed", None) is not None:
with tf.name_scope(self.pos_conv_embed.name):
self.pos_conv_embed.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFWav2Vec2EncoderStableLayerNorm(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
@@ -1041,6 +1176,21 @@ class TFWav2Vec2EncoderStableLayerNorm(tf.keras.layers.Layer):
attentions=all_self_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "pos_conv_embed", None) is not None:
with tf.name_scope(self.pos_conv_embed.name):
self.pos_conv_embed.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
@@ -1057,12 +1207,23 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
else:
self.encoder = TFWav2Vec2Encoder(config, name="encoder")
def build(self, input_shape: tf.TensorShape):
def build(self, input_shape=None):
self.masked_spec_embed = self.add_weight(
shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "feature_extractor", None) is not None:
with tf.name_scope(self.feature_extractor.name):
self.feature_extractor.build(None)
if getattr(self, "feature_projection", None) is not None:
with tf.name_scope(self.feature_projection.name):
self.feature_projection.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
"""
@@ -1419,6 +1580,14 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wav2vec2", None) is not None:
with tf.name_scope(self.wav2vec2.name):
self.wav2vec2.build(None)
@add_start_docstrings(
"""TFWav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
@@ -1431,6 +1600,9 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
self.wav2vec2 = TFWav2Vec2MainLayer(config, name="wav2vec2")
self.dropout = tf.keras.layers.Dropout(config.final_dropout)
self.lm_head = tf.keras.layers.Dense(config.vocab_size, name="lm_head")
self.output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
def freeze_feature_extractor(self):
"""
@@ -1572,6 +1744,17 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wav2vec2", None) is not None:
with tf.name_scope(self.wav2vec2.name):
self.wav2vec2.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.output_hidden_size])
class TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel):
def __init__(self, config):
@@ -1669,3 +1852,17 @@ class TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel):
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wav2vec2", None) is not None:
with tf.name_scope(self.wav2vec2.name):
self.wav2vec2.build(None)
if getattr(self, "projector", None) is not None:
with tf.name_scope(self.projector.name):
self.projector.build([None, None, self.config.hidden_size])
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.classifier_proj_size])
@@ -313,6 +313,23 @@ class TFWhisperAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextEncoderLayer with Speech2Text->Whisper
class TFWhisperEncoderLayer(tf.keras.layers.Layer):
@@ -329,6 +346,7 @@ class TFWhisperEncoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False
@@ -369,6 +387,26 @@ class TFWhisperEncoderLayer(tf.keras.layers.Layer):
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextDecoderLayer with Speech2Text->Whisper
class TFWhisperDecoderLayer(tf.keras.layers.Layer):
@@ -399,6 +437,7 @@ class TFWhisperDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
@@ -482,6 +521,32 @@ class TFWhisperDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFWhisperPreTrainedModel(TFPreTrainedModel):
config_class = WhisperConfig
@@ -749,6 +814,27 @@ class TFWhisperEncoder(tf.keras.layers.Layer):
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build([None, None, self.num_mel_bins])
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build([None, None, self.embed_dim])
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "encoder_layers", None) is not None:
for layer in self.encoder_layers:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFWhisperDecoder(tf.keras.layers.Layer):
@@ -988,6 +1074,24 @@ class TFWhisperDecoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "decoder_layers", None) is not None:
for layer in self.decoder_layers:
with tf.name_scope(layer.name):
layer.build(None)
@add_start_docstrings(
"The bare Whisper Model outputting raw hidden-states without any specific head on top.",
@@ -1111,6 +1215,17 @@ class TFWhisperMainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"The bare Whisper Model outputting raw hidden-states without any specific head on top.",
@@ -1219,6 +1334,14 @@ class TFWhisperModel(TFWhisperPreTrainedModel):
encoder_attentions=enc_attns,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"The Whisper Model with a language modeling head. Can be used for automatic speech recognition.",
@@ -1630,3 +1753,11 @@ class TFWhisperForConditionalGeneration(TFWhisperPreTrainedModel, TFCausalLanguageModelingLoss):
"decoder_attention_mask": decoder_attention_mask,
"decoder_position_ids": decoder_position_ids,
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@@ -301,6 +301,23 @@ class TFXGLMAttention(tf.keras.layers.Layer):
return attn_output, attn_weights, past_key_value
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFXGLMDecoderLayer(tf.keras.layers.Layer):
def __init__(self, config: XGLMConfig, **kwargs: Any) -> None:
@@ -333,6 +350,7 @@ class TFXGLMDecoderLayer(tf.keras.layers.Layer):
self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer.call
def call(
@@ -415,6 +433,32 @@ class TFXGLMDecoderLayer(tf.keras.layers.Layer):
present_key_value,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
@keras_serializable
class TFXGLMMainLayer(tf.keras.layers.Layer):
@@ -609,6 +653,21 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFXGLMPreTrainedModel(TFPreTrainedModel):
config_class = XGLMConfig
@@ -792,6 +851,14 @@ class TFXGLMModel(TFXGLMPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
@add_start_docstrings(
"""
@@ -822,6 +889,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
kernel_initializer=get_initializer(config.init_std),
name="lm_head",
)
self.config = config
def get_output_embeddings(self):
return self.lm_head
@@ -925,6 +993,17 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.hidden_size])
def tf_to_pt_weight_rename(self, tf_weight):
if tf_weight == "lm_head.weight":
return tf_weight, "model.embed_tokens.weight"
@@ -132,6 +132,7 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer):
self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
self.pruned_heads = set()
self.dim = dim
def prune_heads(self, heads):
raise NotImplementedError
@@ -206,6 +207,23 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.dim])
class TFXLMTransformerFFN(tf.keras.layers.Layer):
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
@@ -215,6 +233,8 @@ class TFXLMTransformerFFN(tf.keras.layers.Layer):
self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.in_dim = in_dim
self.dim_hidden = dim_hidden
def call(self, input, training=False):
x = self.lin1(input)
@@ -224,6 +244,17 @@ class TFXLMTransformerFFN(tf.keras.layers.Layer):
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.in_dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.dim_hidden])
@keras_serializable
class TFXLMMainLayer(tf.keras.layers.Layer):
@@ -316,7 +347,10 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
if self.attentions[int(layer)].n_heads == config.n_heads:
self.prune_heads({int(layer): list(map(int, heads))})
def build(self, input_shape):
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
@@ -331,8 +365,24 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
shape=[self.n_langs, self.dim],
initializer=get_initializer(self.embed_init_std),
)
super().build(input_shape)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "layer_norm_emb", None) is not None:
with tf.name_scope(self.layer_norm_emb.name):
self.layer_norm_emb.build([None, None, self.dim])
for layer in self.attentions:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm1:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
for layer in self.ffns:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm2:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
def get_input_embeddings(self):
return self.embeddings
@@ -734,6 +784,14 @@ class TFXLMModel(TFXLMPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
class TFXLMPredLayer(tf.keras.layers.Layer):
"""
@@ -871,6 +929,17 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "pred_layer", None) is not None:
with tf.name_scope(self.pred_layer.name):
self.pred_layer.build(None)
@add_start_docstrings(
"""
@@ -949,6 +1018,17 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
@add_start_docstrings(
"""
@@ -966,6 +1046,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
self.logits_proj = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@property
def dummy_inputs(self):
@@ -1068,6 +1149,20 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.num_labels])
@add_start_docstrings(
"""
@@ -1086,6 +1181,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1148,6 +1244,17 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1163,6 +1270,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1238,3 +1346,14 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
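With explicit shapes threaded through every build() above, an XLM head model can in principle have all of its weights created without a dummy forward pass. A rough usage sketch under that assumption (the tiny config values are placeholders, not taken from the diff):

from transformers import XLMConfig, TFXLMForSequenceClassification

config = XLMConfig(n_layers=2, n_heads=2, emb_dim=32)  # placeholder toy config
model = TFXLMForSequenceClassification(config)
model.build(None)  # walks the build() chain added above and creates every weight eagerly
print(len(model.weights))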
@@ -178,7 +178,7 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape: tf.TensorShape): def build(self, input_shape=None):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
self.weight = self.add_weight( self.weight = self.add_weight(
name="weight", name="weight",
...@@ -200,7 +200,12 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -200,7 +200,12 @@ class TFXLMRobertaEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range), initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
"""
@@ -273,6 +278,7 @@ class TFXLMRobertaPooler(tf.keras.layers.Layer):
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
@@ -282,6 +288,14 @@ class TFXLMRobertaPooler(tf.keras.layers.Layer):
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->XLMRoberta
class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):
@@ -311,6 +325,7 @@ class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -400,6 +415,20 @@ class TFXLMRobertaSelfAttention(tf.keras.layers.Layer):
outputs = outputs + (past_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->XLMRoberta
class TFXLMRobertaSelfOutput(tf.keras.layers.Layer):
@@ -411,6 +440,7 @@ class TFXLMRobertaSelfOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -419,6 +449,17 @@ class TFXLMRobertaSelfOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->XLMRoberta
class TFXLMRobertaAttention(tf.keras.layers.Layer):
@@ -460,6 +501,17 @@ class TFXLMRobertaAttention(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->XLMRoberta
class TFXLMRobertaIntermediate(tf.keras.layers.Layer):
@@ -474,6 +526,7 @@ class TFXLMRobertaIntermediate(tf.keras.layers.Layer):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -481,6 +534,14 @@ class TFXLMRobertaIntermediate(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->XLMRoberta
class TFXLMRobertaOutput(tf.keras.layers.Layer):
@@ -492,6 +553,7 @@ class TFXLMRobertaOutput(tf.keras.layers.Layer):
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
@@ -500,6 +562,17 @@ class TFXLMRobertaOutput(tf.keras.layers.Layer):
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->XLMRoberta
class TFXLMRobertaLayer(tf.keras.layers.Layer):
@@ -587,6 +660,23 @@ class TFXLMRobertaLayer(tf.keras.layers.Layer):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->XLMRoberta
class TFXLMRobertaEncoder(tf.keras.layers.Layer):
@@ -657,6 +747,15 @@ class TFXLMRobertaEncoder(tf.keras.layers.Layer):
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->XLMRoberta
@@ -855,6 +954,20 @@ class TFXLMRobertaMainLayer(tf.keras.layers.Layer):
cross_attentions=encoder_outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->XLMRoberta
class TFXLMRobertaPreTrainedModel(TFPreTrainedModel):
@@ -940,6 +1053,14 @@ class TFXLMRobertaModel(TFXLMRobertaPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->XLMRoberta
class TFXLMRobertaLMHead(tf.keras.layers.Layer):
@@ -960,10 +1081,18 @@ class TFXLMRobertaLMHead(tf.keras.layers.Layer):
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape): def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
@@ -1072,6 +1201,17 @@ class TFXLMRobertaForMaskedLM(TFXLMRobertaPreTrainedModel, TFMaskedLanguageModel
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
@add_start_docstrings(
"XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.",
@@ -1199,6 +1339,17 @@ class TFXLMRobertaForCausalLM(TFXLMRobertaPreTrainedModel, TFCausalLanguageModel
cross_attentions=outputs.cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->XLMRoberta
class TFXLMRobertaClassificationHead(tf.keras.layers.Layer):
@@ -1219,6 +1370,7 @@ class TFXLMRobertaClassificationHead(tf.keras.layers.Layer):
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
@@ -1228,6 +1380,17 @@ class TFXLMRobertaClassificationHead(tf.keras.layers.Layer):
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1305,6 +1468,17 @@ class TFXLMRobertaForSequenceClassification(TFXLMRobertaPreTrainedModel, TFSeque
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
@add_start_docstrings(
"""
@@ -1327,6 +1501,7 @@ class TFXLMRobertaForMultipleChoice(TFXLMRobertaPreTrainedModel, TFMultipleChoic
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
@@ -1398,6 +1573,17 @@ class TFXLMRobertaForMultipleChoice(TFXLMRobertaPreTrainedModel, TFMultipleChoic
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1424,6 +1610,7 @@ class TFXLMRobertaForTokenClassification(TFXLMRobertaPreTrainedModel, TFTokenCla
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1482,6 +1669,17 @@ class TFXLMRobertaForTokenClassification(TFXLMRobertaPreTrainedModel, TFTokenCla
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1503,6 +1701,7 @@ class TFXLMRobertaForQuestionAnswering(TFXLMRobertaPreTrainedModel, TFQuestionAn
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1574,3 +1773,14 @@ class TFXLMRobertaForQuestionAnswering(TFXLMRobertaPreTrainedModel, TFQuestionAn
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "roberta", None) is not None:
with tf.name_scope(self.roberta.name):
self.roberta.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@@ -85,8 +85,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.config = config
def build(self, input_shape): def build(self, input_shape=None):
initializer = get_initializer(self.initializer_range)
self.q = self.add_weight(
shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q"
@@ -115,7 +116,13 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self.seg_embed = self.add_weight(
shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
def prune_heads(self, heads):
raise NotImplementedError
@@ -344,6 +351,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
self.activation_function = get_tf_activation(config.ff_activation)
else:
self.activation_function = config.ff_activation
self.config = config
def call(self, inp, training=False):
output = inp
@@ -355,6 +363,20 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
output = self.layer_norm(output + inp)
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
if getattr(self, "layer_1", None) is not None:
with tf.name_scope(self.layer_1.name):
self.layer_1.build([None, None, self.config.d_model])
if getattr(self, "layer_2", None) is not None:
with tf.name_scope(self.layer_2.name):
self.layer_2.build([None, None, self.config.d_inner])
class TFXLNetLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
@@ -399,6 +421,17 @@ class TFXLNetLayer(tf.keras.layers.Layer):
outputs = (output_h, output_g) + outputs[2:] # Add again attentions if there are there
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rel_attn", None) is not None:
with tf.name_scope(self.rel_attn.name):
self.rel_attn.build(None)
if getattr(self, "ff", None) is not None:
with tf.name_scope(self.ff.name):
self.ff.build(None)
class TFXLNetLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
@@ -471,12 +504,22 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
self.word_embedding.weight = value
self.word_embedding.vocab_size = shape_list(value)[0]
def build(self, input_shape): def build(self, input_shape=None):
initializer = get_initializer(self.initializer_range)
self.mask_emb = self.add_weight(
shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
)
super().build(input_shape)
if self.built:
return
self.built = True
if getattr(self, "word_embedding", None) is not None:
with tf.name_scope(self.word_embedding.name):
self.word_embedding.build(None)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
@@ -1177,6 +1220,14 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@add_start_docstrings(
"""
@@ -1336,6 +1387,17 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "lm_loss", None) is not None:
with tf.name_scope(self.lm_loss.name):
self.lm_loss.build(None)
@add_start_docstrings(
"""
@@ -1356,6 +1418,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
self.logits_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1423,6 +1486,20 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.d_model])
@add_start_docstrings(
"""
@@ -1442,6 +1519,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
self.logits_proj = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1524,6 +1602,20 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.d_model])
@add_start_docstrings(
"""
@@ -1541,6 +1633,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1604,6 +1697,17 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
@@ -1619,6 +1723,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1697,3 +1802,14 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
@@ -2161,16 +2161,8 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope` inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
@@ -2359,16 +2351,8 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer):
positions = self.embed_positions(input_shape, past_key_values_length)
if inputs_embeds is None:
# if `self.embed_tokens.load_weight_prefix` is set, runs the embedding operation with the correct name check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
# scope, so that its weights are registered with the desired name for loading/storing. When `tf.name_scope` inputs_embeds = self.embed_tokens(input_ids)
# is used with a name ending in `/`, that name replaces the current name scope.
# (embeddings with tf.name_scope: self.embed_tokens.load_weight_prefix/self.embed_tokens.name/embeddings:0)
context = []
if hasattr(self.embed_tokens, "load_weight_prefix"):
context.append(tf.name_scope(self.embed_tokens.load_weight_prefix + "/"))
with ContextManagers(context):
check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
inputs_embeds = self.embed_tokens(input_ids)
hidden_states = inputs_embeds
@@ -2578,6 +2562,13 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer):
encoder_attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
# The shared/tied weights expect to be in the model base namespace
# Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
# the current one.
with tf.name_scope(self.shared.load_weight_prefix + '/' + self.shared.name + '/'):
self.shared.build(None)
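The comment in this build() leans on a TensorFlow detail: a tf.name_scope whose name ends in "/" is treated as an absolute scope and replaces the current nesting instead of extending it. A small graph-mode illustration of that behavior (the scope names here are arbitrary, not taken from the diff):

import tensorflow as tf

g = tf.Graph()
with g.as_default():
    with tf.name_scope("outer"):
        with tf.name_scope("model/shared/"):  # trailing "/" -> absolute scope
            a = tf.constant(0.0, name="a")    # op name: model/shared/a
        with tf.name_scope("inner"):          # no trailing "/" -> nested under "outer"
            b = tf.constant(0.0, name="b")    # op name: outer/inner/b
print(a.name, b.name)  # model/shared/a:0  outer/inner/b:0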
@add_start_docstrings(
"The bare {{cookiecutter.uppercase_modelname}} Model outputting raw hidden-states without any specific head on top.",
......
@@ -1071,9 +1071,9 @@ class TFEncoderDecoderModelSaveLoadTests(unittest.TestCase):
# create two random BERT models for bert2bert & initialize weights (+cross_attention weights)
encoder = TFBertModel(config.encoder)
encoder.build() encoder.build_in_name_scope()
decoder = TFBertLMHeadModel(config.decoder)
decoder.build() decoder.build_in_name_scope()
encoder_decoder_orig = TFEncoderDecoderModel(encoder=encoder, decoder=decoder)
......
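The tests above switch from build() to build_in_name_scope(), so that submodels built by hand before being wrapped in an encoder-decoder keep weight names under their own model prefix. A hedged sketch of what such a helper could look like (this is an assumption for illustration, not the library's actual implementation):

import tensorflow as tf

def build_in_name_scope(model: tf.keras.Model) -> None:
    # Build the model's weights inside its own name scope rather than
    # whatever scope the caller happens to be in.
    with tf.name_scope(model.name):
        model.build(input_shape=None)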
@@ -180,7 +180,7 @@ class TFOPTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
else:
# Here we build the word embeddings weights if not exists.
# And then we retry to get the attribute once built.
model.build() model.build_in_name_scope()
if hasattr(embedding_layer, "weight"):
return embedding_layer.weight
else:
......
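The retry logic around this hunk amounts to: try to read the embedding weight, and if the model has never been built, build it (now via build_in_name_scope) and read it again. A condensed sketch of that access pattern, with the helper name being illustrative rather than part of the test suite:

def get_embedding_weight(model):
    embedding_layer = model.get_input_embeddings()
    if hasattr(embedding_layer, "weight"):
        return embedding_layer.weight
    # Weights may not exist yet; build the model in its own name scope and retry.
    model.build_in_name_scope()
    embedding_layer = model.get_input_embeddings()
    return getattr(embedding_layer, "weight", None)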
@@ -729,9 +729,9 @@ class TFVisionEncoderDecoderModelSaveLoadTests(unittest.TestCase):
# create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights)
encoder = TFViTModel(config.encoder)
encoder.build() encoder.build_in_name_scope()
decoder = TFGPT2LMHeadModel(config.decoder)
decoder.build() decoder.build_in_name_scope()
encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
......
@@ -290,7 +290,7 @@ class TFWhisperModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestC
for model_class in self.all_model_classes:
model = model_class(config)
model.build() model.build_in_name_scope()
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname, saved_model=False)
......
@@ -21,7 +21,7 @@ from transformers import (
TFPreTrainedModel,
pipeline,
)
from transformers.testing_utils import get_gpu_count, is_pipeline_test, require_tf, require_torch, slow, torch_device from transformers.testing_utils import is_pipeline_test, require_tf, require_torch, slow, torch_device
from transformers.tokenization_utils import TruncationStrategy
from .test_pipelines_common import ANY
@@ -67,8 +67,8 @@ class SummarizationPipelineTests(unittest.TestCase):
# the embedding layer.
if not (
isinstance(model, TFPreTrainedModel)
and get_gpu_count() > 0
and len(summarizer.model.trainable_weights) > 0
and "GPU" in summarizer.model.trainable_weights[0].device
):
with self.assertRaises(Exception):
outputs = summarizer("This " * 1000)
......