Unverified Commit 14042d56 authored by Julien Plu, committed by GitHub

New TF embeddings (cleaner and faster) (#9418)



* Create new embeddings + add to BERT

* Add Albert

* Add DistilBert

* Add Albert + Electra + Funnel

* Add Longformer + Lxmert

* Add last models

* Apply style

* Update the template

* Remove unused imports

* Rename attribute

* Import embeddings in their own model file

* Replace word_embeddings with weight

* Fix naming

* Fix Albert

* Fix Albert

* Fix Longformer

* Fix Lxmert Mobilebert and MPNet

* Fix copy

* Fix template

* Update the get weights function

* Update src/transformers/modeling_tf_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/electra/modeling_tf_electra.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* address Sylvain's comments
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 12f0d7e8
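For orientation, an illustrative sketch (not part of the diff; the toy sizes and the plain Keras Embedding stand-ins are assumptions) of the pattern this commit introduces: word, position and token-type embeddings each live in their own small layer, and the per-model TFXxxEmbeddings module only sums their outputs with tf.keras.layers.Add(), applies LayerNorm, then dropout.

import tensorflow as tf

# Toy sizes, assumed for the sketch only.
vocab_size, hidden_size, max_positions, type_vocab_size = 100, 16, 32, 2

word = tf.keras.layers.Embedding(vocab_size, hidden_size)             # stand-in for TFBertWordEmbeddings
position = tf.keras.layers.Embedding(max_positions, hidden_size)      # stand-in for TFBertPositionEmbeddings
token_type = tf.keras.layers.Embedding(type_vocab_size, hidden_size)  # stand-in for TFBertTokenTypeEmbeddings
embeddings_sum = tf.keras.layers.Add()
layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
dropout = tf.keras.layers.Dropout(0.1)

input_ids = tf.constant([[7, 42, 13]])
seq_length = tf.shape(input_ids)[1]
final_embeddings = embeddings_sum([
    word(input_ids),
    position(tf.range(seq_length)[tf.newaxis, :]),
    token_type(tf.zeros_like(input_ids)),
])
final_embeddings = dropout(layer_norm(final_embeddings), training=False)
print(final_embeddings.shape)  # (1, 3, 16)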
src/transformers/modeling_tf_utils.py
@@ -809,24 +809,28 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
         return model_embeds
 
-    def _get_word_embedding_weight(self, embedding_layer):
-        if hasattr(embedding_layer, "word_embeddings"):
-            return embedding_layer.word_embeddings
-        elif hasattr(embedding_layer, "weight"):
-            return embedding_layer.weight
-        elif hasattr(embedding_layer, "decoder"):
-            return embedding_layer.decoder
-        else:
-            # Here we build the word embeddings weights if not exists.
-            # And then we retry to get the attribute once built.
-            self(self.dummy_inputs)
-            if hasattr(embedding_layer, "word_embeddings"):
-                return embedding_layer.word_embeddings
-            elif hasattr(embedding_layer, "weight"):
-                return embedding_layer.weight
-            elif hasattr(embedding_layer, "decoder"):
-                return embedding_layer.decoder
-            else:
-                return None
+    def _get_word_embedding_weight(model, embedding_layer):
+        embeds = getattr(embedding_layer, "weight", None)
+        if embeds is not None:
+            return embeds
+
+        embeds = getattr(embedding_layer, "decoder", None)
+        if embeds is not None:
+            return embeds
+
+        # The reason why the attributes don't exist might be
+        # because the model is not built, so retry getting
+        # the argument after building the model
+        model(model.dummy_inputs)
+
+        embeds = getattr(embedding_layer, "weight", None)
+        if embeds is not None:
+            return embeds
+
+        embeds = getattr(embedding_layer, "decoder", None)
+        if embeds is not None:
+            return embeds
+
+        return None
 
     def _resize_token_embeddings(self, new_num_tokens):
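As an aside, a standalone sketch (helper name and toy model are made up; this is not the library's API) of the lookup-then-build-then-retry pattern the new _get_word_embedding_weight uses: the variable only exists once the layer has been built, so the model is run once on dummy inputs and the lookup is retried.

import tensorflow as tf

def get_weight_or_build(model, layer, dummy_inputs, attr="embeddings"):
    # Try the attribute first; it is only created in build().
    weight = getattr(layer, attr, None)
    if weight is not None:
        return weight
    model(dummy_inputs)  # the first forward pass builds every layer, creating its variables
    return getattr(layer, attr, None)

embedding = tf.keras.layers.Embedding(10, 4)  # its "embeddings" variable is created lazily in build()
model = tf.keras.Sequential([embedding])
weight = get_weight_or_build(model, embedding, tf.constant([[1, 2, 3]]))
print(weight.shape)  # (10, 4)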
@@ -1319,6 +1323,119 @@ class TFConv1D(tf.keras.layers.Layer):
         return x
 
 
+class WordEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.word_embeddings = self.add_weight(
+            name="weight",
+            shape=[self.vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "vocab_size": self.vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, input_ids):
+        flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
+        embeddings = tf.gather(params=self.word_embeddings, indices=flat_input_ids)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+class TokenTypeEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.type_vocab_size = type_vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.token_type_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.type_vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "type_vocab_size": self.type_vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, token_type_ids):
+        flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
+        one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
+        embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+class PositionEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.position_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.max_position_embeddings, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape)
+
+    def get_config(self):
+        config = {
+            "max_position_embeddings": self.max_position_embeddings,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, position_ids):
+        input_shape = shape_list(tensor=position_ids)
+        position_embeddings = self.position_embeddings[: input_shape[1], :]
+
+        return tf.broadcast_to(input=position_embeddings, shape=input_shape)
+
+
 class TFSharedEmbeddings(tf.keras.layers.Layer):
     r"""
     Construct shared token embeddings.
...
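A usage sketch of the three generic layers added above. The import path assumes a transformers checkout that contains this commit; the toy sizes are made up.

import tensorflow as tf
from transformers.modeling_tf_utils import PositionEmbeddings, TokenTypeEmbeddings, WordEmbeddings

word = WordEmbeddings(vocab_size=100, hidden_size=8, initializer_range=0.02, name="word_embeddings")
token_type = TokenTypeEmbeddings(type_vocab_size=2, hidden_size=8, initializer_range=0.02, name="token_type_embeddings")
position = PositionEmbeddings(max_position_embeddings=16, hidden_size=8, initializer_range=0.02, name="position_embeddings")

input_ids = tf.constant([[5, 7, 9]])
word_embeds = word(input_ids)                       # (1, 3, 8), tf.gather on the variable named "weight"
type_embeds = token_type(tf.zeros_like(input_ids))  # (1, 3, 8), one-hot matmul against "embeddings"
# PositionEmbeddings slices its table to the sequence length and broadcasts to the shape of
# whatever tensor it receives, so the word embeddings can be passed in directly.
position_embeds = position(word_embeds)             # (1, 3, 8)
print((word_embeds + type_embeds + position_embeds).shape)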
src/transformers/models/albert/modeling_tf_albert.py
@@ -73,124 +73,178 @@ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
-class TFAlbertEmbeddings(tf.keras.layers.Layer):
-    """Construct the embeddings from word, position and token_type embeddings."""
-
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-
-        self.vocab_size = config.vocab_size
-        self.embedding_size = config.embedding_size
-        self.initializer_range = config.initializer_range
-        self.max_position_embeddings = config.max_position_embeddings
-        self.type_vocab_size = config.type_vocab_size
-        self.layer_norm_eps = config.layer_norm_eps
-        self.hidden_dropout_prob = config.hidden_dropout_prob
-
-        self.position_embeddings = tf.keras.layers.Embedding(
-            self.max_position_embeddings,
-            self.embedding_size,
-            embeddings_initializer=get_initializer(self.initializer_range),
-            name="position_embeddings",
-        )
-        self.token_type_embeddings = tf.keras.layers.Embedding(
-            self.type_vocab_size,
-            self.embedding_size,
-            embeddings_initializer=get_initializer(self.initializer_range),
-            name="token_type_embeddings",
-        )
-
-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=self.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(self.hidden_dropout_prob)
-
-    def build(self, input_shape):
-        """Build shared word embedding layer """
-        with tf.name_scope("word_embeddings"):
-            # Create and initialize weights. The random normal initializer was chosen
-            # arbitrarily, and works well.
-            self.word_embeddings = self.add_weight(
-                "weight",
-                shape=[self.vocab_size, self.embedding_size],
-                initializer=get_initializer(self.initializer_range),
-            )
-
-        super().build(input_shape)
-
-    def call(
-        self,
-        input_ids=None,
-        position_ids=None,
-        token_type_ids=None,
-        inputs_embeds=None,
-        mode="embedding",
-        training=False,
-    ):
-        """
-        Get token embeddings of inputs
-
-        Args:
-            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
-            mode: string, a valid value is one of "embedding" and "linear"
-
-        Returns:
-            outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length,
-            embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length,
-            vocab_size]
-
-        Raises:
-            ValueError: if mode is not valid.
-
-        Shared weights logic adapted from
-        https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        if mode == "embedding":
-            return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
-        elif mode == "linear":
-            return self._linear(input_ids)
-        else:
-            raise ValueError("mode {} is not valid.".format(mode))
-
-    def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
-        """Applies embedding based on inputs tensor."""
-        assert not (input_ids is None and inputs_embeds is None)
-
-        if input_ids is not None:
-            input_shape = shape_list(input_ids)
-        else:
-            input_shape = shape_list(inputs_embeds)[:-1]
-
-        seq_length = input_shape[1]
-
-        if position_ids is None:
-            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
-
-        if token_type_ids is None:
-            token_type_ids = tf.fill(input_shape, 0)
-
-        if inputs_embeds is None:
-            inputs_embeds = tf.gather(self.word_embeddings, input_ids)
-
-        position_embeddings = self.position_embeddings(position_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
-        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings, training=training)
-
-        return embeddings
-
-    def _linear(self, inputs):
-        """
-        Computes logits by running inputs through a linear layer
-
-        Args:
-            inputs: A float32 tensor with shape [batch_size, length, embedding_size
-
-        Returns:
-            float32 tensor with shape [batch_size, length, vocab_size].
-        """
-        batch_size = shape_list(inputs)[0]
-        length = shape_list(inputs)[1]
-
-        x = tf.reshape(inputs, [-1, self.embedding_size])
-        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
-
-        return tf.reshape(logits, [batch_size, length, self.vocab_size])
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
+class TFAlbertWordEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.weight = self.add_weight(
+            name="weight",
+            shape=[self.vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "vocab_size": self.vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, input_ids):
+        flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
+        embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
+class TFAlbertTokenTypeEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.type_vocab_size = type_vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.token_type_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.type_vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "type_vocab_size": self.type_vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, token_type_ids):
+        flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
+        one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
+        embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
+class TFAlbertPositionEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.position_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.max_position_embeddings, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape)
+
+    def get_config(self):
+        config = {
+            "max_position_embeddings": self.max_position_embeddings,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, position_ids):
+        input_shape = shape_list(tensor=position_ids)
+        position_embeddings = self.position_embeddings[: input_shape[1], :]
+
+        return tf.broadcast_to(input=position_embeddings, shape=input_shape)
+
+
+class TFAlbertEmbeddings(tf.keras.layers.Layer):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.word_embeddings = TFAlbertWordEmbeddings(
+            vocab_size=config.vocab_size,
+            hidden_size=config.embedding_size,
+            initializer_range=config.initializer_range,
+            name="word_embeddings",
+        )
+        self.position_embeddings = TFAlbertPositionEmbeddings(
+            max_position_embeddings=config.max_position_embeddings,
+            hidden_size=config.embedding_size,
+            initializer_range=config.initializer_range,
+            name="position_embeddings",
+        )
+        self.token_type_embeddings = TFAlbertTokenTypeEmbeddings(
+            type_vocab_size=config.type_vocab_size,
+            hidden_size=config.embedding_size,
+            initializer_range=config.initializer_range,
+            name="token_type_embeddings",
+        )
+        self.embeddings_sum = tf.keras.layers.Add()
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
+    def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+
+        if input_ids is not None:
+            inputs_embeds = self.word_embeddings(input_ids=input_ids)
+
+        if token_type_ids is None:
+            input_shape = shape_list(tensor=inputs_embeds)[:-1]
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        if position_ids is None:
+            position_embeds = self.position_embeddings(position_ids=inputs_embeds)
+        else:
+            position_embeds = self.position_embeddings(position_ids=position_ids)
+
+        token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
+        final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
 
 
 class TFAlbertSelfOutput(tf.keras.layers.Layer):
@@ -446,8 +500,9 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
 class TFAlbertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
         super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
 
+        self.vocab_size = config.vocab_size
+        self.embedding_size = config.embedding_size
         self.dense = tf.keras.layers.Dense(
             config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -474,7 +529,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         return self.decoder
 
     def set_output_embeddings(self, value):
-        self.decoder.word_embeddings = value
+        self.decoder.weight = value
        self.decoder.vocab_size = shape_list(value)[0]
 
     def get_bias(self):
@@ -486,10 +541,15 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         self.vocab_size = shape_list(value["bias"])[0]
 
     def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.activation(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.activation(inputs=hidden_states)
+        hidden_states = self.LayerNorm(inputs=hidden_states)
+        seq_length = shape_list(tensor=hidden_states)[1]
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
+        hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias)
 
         return hidden_states
@@ -516,11 +576,11 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
         )
 
     def get_input_embeddings(self):
-        return self.embeddings
+        return self.embeddings.word_embeddings
 
     def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-        self.embeddings.vocab_size = shape_list(value)[0]
+        self.embeddings.word_embeddings.weight = value
+        self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
 
     def _prune_heads(self, heads_to_prune):
         """
@@ -844,7 +904,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
         self.num_labels = config.num_labels
 
         self.albert = TFAlbertMainLayer(config, name="albert")
-        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
+        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings.word_embeddings, name="predictions")
         self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
 
     def get_lm_head(self):
@@ -964,7 +1024,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
         super().__init__(config, *inputs, **kwargs)
 
         self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
-        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
+        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings.word_embeddings, name="predictions")
 
     def get_lm_head(self):
         return self.predictions
...
src/transformers/models/bert/modeling_tf_bert.py
@@ -121,124 +121,174 @@ class TFBertPreTrainingLoss:
         return masked_lm_loss + next_sentence_loss
 
 
-class TFBertEmbeddings(tf.keras.layers.Layer):
-    """Construct the embeddings from word, position and token_type embeddings."""
-
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-
-        self.vocab_size = config.vocab_size
-        self.hidden_size = config.hidden_size
-        self.initializer_range = config.initializer_range
-        self.position_embeddings = tf.keras.layers.Embedding(
-            config.max_position_embeddings,
-            config.hidden_size,
-            embeddings_initializer=get_initializer(self.initializer_range),
-            name="position_embeddings",
-        )
-        self.token_type_embeddings = tf.keras.layers.Embedding(
-            config.type_vocab_size,
-            config.hidden_size,
-            embeddings_initializer=get_initializer(self.initializer_range),
-            name="token_type_embeddings",
-        )
-
-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def build(self, input_shape):
-        """Build shared word embedding layer """
-        with tf.name_scope("word_embeddings"):
-            # Create and initialize weights. The random normal initializer was chosen
-            # arbitrarily, and works well.
-            self.word_embeddings = self.add_weight(
-                "weight",
-                shape=[self.vocab_size, self.hidden_size],
-                initializer=get_initializer(self.initializer_range),
-            )
-
-        super().build(input_shape)
-
-    def call(
-        self,
-        input_ids=None,
-        position_ids=None,
-        token_type_ids=None,
-        inputs_embeds=None,
-        mode="embedding",
-        training=False,
-    ):
-        """
-        Get token embeddings of inputs.
-
-        Args:
-            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
-            mode: string, a valid value is one of "embedding" and "linear".
-
-        Returns:
-            outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length,
-            embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length,
-            vocab_size].
-
-        Raises:
-            ValueError: if mode is not valid.
-
-        Shared weights logic adapted from
-        https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        if mode == "embedding":
-            return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
-        elif mode == "linear":
-            return self._linear(input_ids)
-        else:
-            raise ValueError("mode {} is not valid.".format(mode))
-
-    def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
-        """Applies embedding based on inputs tensor."""
-        assert not (input_ids is None and inputs_embeds is None)
-
-        if input_ids is not None:
-            input_shape = shape_list(input_ids)
-        else:
-            input_shape = shape_list(inputs_embeds)[:-1]
-
-        seq_length = input_shape[1]
-
-        if position_ids is None:
-            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
-
-        if token_type_ids is None:
-            token_type_ids = tf.fill(input_shape, 0)
-
-        if inputs_embeds is None:
-            inputs_embeds = tf.gather(self.word_embeddings, input_ids)
-
-        position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype)
-        token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype)
-        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings, training=training)
-
-        return embeddings
-
-    def _linear(self, inputs):
-        """
-        Computes logits by running inputs through a linear layer.
-
-        Args:
-            inputs: A float32 tensor with shape [batch_size, length, hidden_size].
-
-        Returns:
-            float32 tensor with shape [batch_size, length, vocab_size].
-        """
-        batch_size = shape_list(inputs)[0]
-        length = shape_list(inputs)[1]
-        x = tf.reshape(inputs, [-1, self.hidden_size])
-        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
-
-        return tf.reshape(logits, [batch_size, length, self.vocab_size])
+class TFBertWordEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.weight = self.add_weight(
+            name="weight",
+            shape=[self.vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "vocab_size": self.vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, input_ids):
+        flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
+        embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+class TFBertTokenTypeEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.type_vocab_size = type_vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.token_type_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.type_vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "type_vocab_size": self.type_vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, token_type_ids):
+        flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
+        one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
+        embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+class TFBertPositionEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.position_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.max_position_embeddings, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape)
+
+    def get_config(self):
+        config = {
+            "max_position_embeddings": self.max_position_embeddings,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, position_ids):
+        input_shape = shape_list(tensor=position_ids)
+        position_embeddings = self.position_embeddings[: input_shape[1], :]
+
+        return tf.broadcast_to(input=position_embeddings, shape=input_shape)
+
+
+class TFBertEmbeddings(tf.keras.layers.Layer):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.word_embeddings = TFBertWordEmbeddings(
+            vocab_size=config.vocab_size,
+            hidden_size=config.hidden_size,
+            initializer_range=config.initializer_range,
+            name="word_embeddings",
+        )
+        self.position_embeddings = TFBertPositionEmbeddings(
+            max_position_embeddings=config.max_position_embeddings,
+            hidden_size=config.hidden_size,
+            initializer_range=config.initializer_range,
+            name="position_embeddings",
+        )
+        self.token_type_embeddings = TFBertTokenTypeEmbeddings(
+            type_vocab_size=config.type_vocab_size,
+            hidden_size=config.hidden_size,
+            initializer_range=config.initializer_range,
+            name="token_type_embeddings",
+        )
+        self.embeddings_sum = tf.keras.layers.Add()
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+
+        if input_ids is not None:
+            inputs_embeds = self.word_embeddings(input_ids=input_ids)
+
+        if token_type_ids is None:
+            input_shape = shape_list(tensor=inputs_embeds)[:-1]
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        if position_ids is None:
+            position_embeds = self.position_embeddings(position_ids=inputs_embeds)
+        else:
+            position_embeds = self.position_embeddings(position_ids=position_ids)
+
+        token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
+        final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
 
 
 class TFBertSelfAttention(tf.keras.layers.Layer):
@@ -251,8 +301,8 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
                 f"of attention heads ({config.num_attention_heads})"
             )
 
-        self.num_attention_heads = config.num_attention_heads
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+
         self.query = tf.keras.layers.experimental.EinsumDense(
             equation="abc,cde->abde",
             output_shape=(None, config.num_attention_heads, self.attention_head_size),
@@ -318,9 +368,9 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
                 f"of attention heads ({config.num_attention_heads})"
             )
 
-        self.num_attention_heads = config.num_attention_heads
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.all_head_size = config.num_attention_heads * self.attention_head_size
+
         self.dense = tf.keras.layers.experimental.EinsumDense(
             equation="abcd,cde->abe",
             output_shape=(None, self.all_head_size),
@@ -516,6 +566,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
         super().__init__(**kwargs)
 
         self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+
         self.transform = TFBertPredictionHeadTransform(config, name="transform")
 
         # The output weights are the same as the input embeddings, but there is
@@ -531,7 +583,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
         return self.input_embeddings
 
     def set_output_embeddings(self, value):
-        self.input_embeddings.word_embeddings = value
+        self.input_embeddings.weight = value
         self.input_embeddings.vocab_size = shape_list(value)[0]
 
     def get_bias(self):
@@ -542,9 +594,12 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
         self.vocab_size = shape_list(value["bias"])[0]
 
     def call(self, hidden_states):
-        hidden_states = self.transform(hidden_states)
-        hidden_states = self.input_embeddings(hidden_states, mode="linear")
-        hidden_states = hidden_states + self.bias
+        hidden_states = self.transform(hidden_states=hidden_states)
+        seq_length = shape_list(tensor=hidden_states)[1]
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
+        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
 
         return hidden_states
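A standalone sketch (toy shapes; the variable names are made up) of the output-projection pattern that replaces the old mode="linear" call throughout this diff: flatten to 2-D, matmul against the transposed shared embedding matrix, reshape back to (batch, seq_length, vocab_size), then add the bias with tf.nn.bias_add.

import tensorflow as tf

batch_size, seq_length, hidden_size, vocab_size = 2, 5, 8, 100
hidden_states = tf.random.normal([batch_size, seq_length, hidden_size])
shared_embedding_weight = tf.random.normal([vocab_size, hidden_size])  # same matrix the input embeddings use
bias = tf.zeros([vocab_size])

logits = tf.reshape(hidden_states, shape=[-1, hidden_size])            # (batch * seq, hidden)
logits = tf.matmul(logits, shared_embedding_weight, transpose_b=True)  # (batch * seq, vocab)
logits = tf.reshape(logits, shape=[-1, seq_length, vocab_size])        # (batch, seq, vocab)
logits = tf.nn.bias_add(logits, bias)
print(logits.shape)  # (2, 5, 100)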
@@ -583,21 +638,17 @@ class TFBertMainLayer(tf.keras.layers.Layer):
         super().__init__(**kwargs)
 
         self.config = config
-        self.num_hidden_layers = config.num_hidden_layers
-        self.initializer_range = config.initializer_range
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.return_dict = config.use_return_dict
         self.embeddings = TFBertEmbeddings(config, name="embeddings")
         self.encoder = TFBertEncoder(config, name="encoder")
         self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None
 
     def get_input_embeddings(self):
-        return self.embeddings
+        return self.embeddings.word_embeddings
 
     def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-        self.embeddings.vocab_size = shape_list(value)[0]
+        self.embeddings.word_embeddings.weight = value
+        self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
 
     def _prune_heads(self, heads_to_prune):
         """
@@ -682,7 +733,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
         if inputs["head_mask"] is not None:
             raise NotImplementedError
         else:
-            inputs["head_mask"] = [None] * self.num_hidden_layers
+            inputs["head_mask"] = [None] * self.config.num_hidden_layers
 
         encoder_outputs = self.encoder(
             embedding_output,
@@ -931,7 +982,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
 
         self.bert = TFBertMainLayer(config, name="bert")
         self.nsp = TFBertNSPHead(config, name="nsp___cls")
-        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
+        self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls")
 
     def get_lm_head(self):
         return self.mlm.predictions
@@ -1055,7 +1106,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
         )
 
         self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
-        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
+        self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls")
 
     def get_lm_head(self):
         return self.mlm.predictions
@@ -1158,7 +1209,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
            logger.warning("If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`")
 
         self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
-        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
+        self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls")
 
     def get_lm_head(self):
         return self.mlm.predictions
...
src/transformers/models/distilbert/modeling_tf_distilbert.py
@@ -67,104 +67,128 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
-class TFEmbeddings(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-        self.dim = config.dim
-        self.initializer_range = config.initializer_range
-        self.position_embeddings = tf.keras.layers.Embedding(
-            config.max_position_embeddings,
-            config.dim,
-            embeddings_initializer=get_initializer(config.initializer_range),
-            name="position_embeddings",
-        )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-
-    def build(self, input_shape):
-        """Build shared word embedding layer """
-        with tf.name_scope("word_embeddings"):
-            # Create and initialize weights. The random normal initializer was chosen
-            # arbitrarily, and works well.
-            self.word_embeddings = self.add_weight(
-                "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
-            )
-
-        super().build(input_shape)
-
-    def call(self, input_ids=None, position_ids=None, inputs_embeds=None, mode="embedding", training=False):
-        """
-        Get token embeddings of inputs.
-
-        Args:
-            inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids)
-            mode: string, a valid value is one of "embedding" and "linear".
-
-        Returns:
-            outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length,
-            embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length,
-            vocab_size].
-
-        Raises:
-            ValueError: if mode is not valid.
-
-        Shared weights logic adapted from
-        https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        if mode == "embedding":
-            return self._embedding(input_ids, position_ids, inputs_embeds, training=training)
-        elif mode == "linear":
-            return self._linear(input_ids)
-        else:
-            raise ValueError("mode {} is not valid.".format(mode))
-
-    def _embedding(self, input_ids, position_ids, inputs_embeds, training=False):
-        """
-        Parameters:
-            input_ids: tf.Tensor(bs, max_seq_length) The token ids to embed.
-
-        Returns:
-            tf.Tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings)
-        """
-        assert not (input_ids is None and inputs_embeds is None)
-
-        if input_ids is not None:
-            seq_length = shape_list(input_ids)[1]
-        else:
-            seq_length = shape_list(inputs_embeds)[1]
-
-        if position_ids is None:
-            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
-
-        if inputs_embeds is None:
-            inputs_embeds = tf.gather(self.word_embeddings, input_ids)
-
-        position_embeddings = tf.cast(
-            self.position_embeddings(position_ids), inputs_embeds.dtype
-        )  # (bs, max_seq_length, dim)
-
-        embeddings = inputs_embeds + position_embeddings  # (bs, max_seq_length, dim)
-        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
-        embeddings = self.dropout(embeddings, training=training)  # (bs, max_seq_length, dim)
-
-        return embeddings
-
-    def _linear(self, inputs):
-        """
-        Computes logits by running inputs through a linear layer
-
-        Args:
-            inputs: A float32 tensor with shape [batch_size, length, hidden_size]
-
-        Returns:
-            float32 tensor with shape [batch_size, length, vocab_size].
-        """
-        batch_size = shape_list(inputs)[0]
-        length = shape_list(inputs)[1]
-
-        x = tf.reshape(inputs, [-1, self.dim])
-        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
-
-        return tf.reshape(logits, [batch_size, length, self.vocab_size])
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
+class TFDistilBertWordEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.weight = self.add_weight(
+            name="weight",
+            shape=[self.vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "vocab_size": self.vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, input_ids):
+        flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
+        embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
+class TFDistilBertPositionEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.position_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.max_position_embeddings, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape)
+
+    def get_config(self):
+        config = {
+            "max_position_embeddings": self.max_position_embeddings,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, position_ids):
+        input_shape = shape_list(tensor=position_ids)
+        position_embeddings = self.position_embeddings[: input_shape[1], :]
+
+        return tf.broadcast_to(input=position_embeddings, shape=input_shape)
+
+
+class TFEmbeddings(tf.keras.layers.Layer):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = config.vocab_size
+        self.dim = config.dim
+        self.initializer_range = config.initializer_range
+        self.word_embeddings = TFDistilBertWordEmbeddings(
+            vocab_size=config.vocab_size,
+            hidden_size=config.dim,
+            initializer_range=config.initializer_range,
+            name="word_embeddings",
+        )
+        self.position_embeddings = TFDistilBertPositionEmbeddings(
+            max_position_embeddings=config.max_position_embeddings,
+            hidden_size=config.dim,
+            initializer_range=config.initializer_range,
+            name="position_embeddings",
+        )
+        self.embeddings_sum = tf.keras.layers.Add()
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
+        self.dropout = tf.keras.layers.Dropout(rate=config.dropout)
+
+    def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+
+        if input_ids is not None:
+            inputs_embeds = self.word_embeddings(input_ids=input_ids)
+
+        if position_ids is None:
+            position_embeds = self.position_embeddings(position_ids=inputs_embeds)
+        else:
+            position_embeds = self.position_embeddings(position_ids=position_ids)
+
+        final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds])
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
 
 
 class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
@@ -397,11 +421,11 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
         self.transformer = TFTransformer(config, name="transformer")  # Encoder
 
     def get_input_embeddings(self):
-        return self.embeddings
+        return self.embeddings.word_embeddings
 
     def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-        self.embeddings.vocab_size = value.shape[0]
+        self.embeddings.word_embeddings.weight = value
+        self.embeddings.word_embeddings.vocab_size = value.shape[0]
 
     def _prune_heads(self, heads_to_prune):
         raise NotImplementedError
@@ -636,7 +660,9 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
 class TFDistilBertLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
         super().__init__(**kwargs)
+
         self.vocab_size = config.vocab_size
+        self.dim = config.dim
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
@@ -644,13 +670,14 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
+
         super().build(input_shape)
 
     def get_output_embeddings(self):
         return self.input_embeddings
 
     def set_output_embeddings(self, value):
-        self.input_embeddings.word_embeddings = value
+        self.input_embeddings.weight = value
         self.input_embeddings.vocab_size = shape_list(value)[0]
 
     def get_bias(self):
@@ -661,8 +688,12 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
         self.vocab_size = shape_list(value["bias"])[0]
 
     def call(self, hidden_states):
-        hidden_states = self.input_embeddings(hidden_states, mode="linear")
-        hidden_states = hidden_states + self.bias
+        seq_length = shape_list(tensor=hidden_states)[1]
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim])
+        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
 
         return hidden_states
@@ -681,7 +712,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
         )
         self.act = get_tf_activation("gelu")
         self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
-        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
+        self.vocab_projector = TFDistilBertLMHead(
+            config, self.distilbert.embeddings.word_embeddings, name="vocab_projector"
+        )
 
     def get_lm_head(self):
         return self.vocab_projector
...
src/transformers/models/electra/modeling_tf_electra.py
@@ -70,6 +70,122 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
+class TFElectraWordEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.weight = self.add_weight(
+            name="weight",
+            shape=[self.vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "vocab_size": self.vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, input_ids):
+        flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
+        embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
+class TFElectraTokenTypeEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.type_vocab_size = type_vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.token_type_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.type_vocab_size, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape=input_shape)
+
+    def get_config(self):
+        config = {
+            "type_vocab_size": self.type_vocab_size,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, token_type_ids):
+        flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
+        one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
+        embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
+        embeddings = tf.reshape(
+            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
+        )
+
+        embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
+
+        return embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
+class TFElectraPositionEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
+        super().__init__(**kwargs)
+
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+
+    def build(self, input_shape):
+        self.position_embeddings = self.add_weight(
+            name="embeddings",
+            shape=[self.max_position_embeddings, self.hidden_size],
+            initializer=get_initializer(initializer_range=self.initializer_range),
+        )
+
+        super().build(input_shape)
+
+    def get_config(self):
+        config = {
+            "max_position_embeddings": self.max_position_embeddings,
+            "hidden_size": self.hidden_size,
+            "initializer_range": self.initializer_range,
+        }
+        base_config = super().get_config()
+
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, position_ids):
+        input_shape = shape_list(tensor=position_ids)
+        position_embeddings = self.position_embeddings[: input_shape[1], :]
+
+        return tf.broadcast_to(input=position_embeddings, shape=input_shape)
+
+
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra
 class TFElectraSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
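An aside on the token-type layers added above: they look indices up with a one-hot matmul rather than tf.gather. The toy check below (illustrative values only) shows the two are numerically equivalent.

import tensorflow as tf

table = tf.constant([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])  # (type_vocab_size=3, hidden_size=2)
token_type_ids = tf.constant([0, 2, 1])

one_hot = tf.one_hot(token_type_ids, depth=3, dtype=table.dtype)
by_matmul = tf.matmul(one_hot, table)
by_gather = tf.gather(table, token_type_ids)
print(bool(tf.reduce_all(tf.equal(by_matmul, by_gather))))  # True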
@@ -81,8 +197,8 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):
                 f"of attention heads ({config.num_attention_heads})"
             )
 
-        self.num_attention_heads = config.num_attention_heads
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+
         self.query = tf.keras.layers.experimental.EinsumDense(
             equation="abc,cde->abde",
             output_shape=(None, config.num_attention_heads, self.attention_head_size),
@@ -138,7 +254,7 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):
         return outputs
 
 
-# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput
 class TFElectraSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
@@ -149,9 +265,9 @@ class TFElectraSelfOutput(tf.keras.layers.Layer):
                 f"of attention heads ({config.num_attention_heads})"
             )
 
-        self.num_attention_heads = config.num_attention_heads
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.all_head_size = config.num_attention_heads * self.attention_head_size
+
         self.dense = tf.keras.layers.experimental.EinsumDense(
             equation="abcd,cde->abe",
             output_shape=(None, self.all_head_size),
...@@ -331,120 +447,56 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = TFElectraWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFElectraPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFElectraTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings.call with Albert->Electra
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)
if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_embeds = self.position_embeddings(position_ids=inputs_embeds)
else:
position_embeds = self.position_embeddings(position_ids=position_ids)
token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
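The rewritten embeddings all share the same composition: word, position and token-type tensors are summed with a keras Add layer, then layer-normalized and dropped out. A rough standalone sketch with random stand-ins for the three sub-layer outputs (toy sizes assumed):

import tensorflow as tf

hidden_size = 8
embeddings_sum = tf.keras.layers.Add()
layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
dropout = tf.keras.layers.Dropout(rate=0.1)

inputs_embeds = tf.random.normal((2, 5, hidden_size))
position_embeds = tf.random.normal((2, 5, hidden_size))
token_type_embeds = tf.random.normal((2, 5, hidden_size))

out = embeddings_sum([inputs_embeds, position_embeds, token_type_embeds])
out = dropout(layer_norm(out), training=True)
print(out.shape)  # (2, 5, 8)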
class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer):
...@@ -508,11 +560,11 @@ class TFElectraMainLayer(tf.keras.layers.Layer):
self.config = config
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings.weight = value
self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
...@@ -903,6 +955,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.embedding_size = config.embedding_size
self.input_embeddings = input_embeddings
def build(self, input_shape):
...@@ -914,7 +967,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
return self.input_embeddings
def set_output_embeddings(self, value):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
...@@ -924,9 +977,12 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
...@@ -953,7 +1009,9 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
else:
self.activation = config.hidden_act
self.generator_lm_head = TFElectraMaskedLMHead(
config, self.electra.embeddings.word_embeddings, name="generator_lm_head"
)
def get_lm_head(self):
return self.generator_lm_head
...
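The masked-LM heads no longer call the embedding layer in "linear" mode; the tied projection is done inline, as in the hunk above. A small standalone sketch of that projection with plain tensors standing in for the shared weight and bias (toy shapes):

import tensorflow as tf

batch, seq_length, embedding_size, vocab_size = 2, 5, 8, 100
hidden_states = tf.random.normal((batch, seq_length, embedding_size))
shared_weight = tf.random.normal((vocab_size, embedding_size))  # same matrix as the input embeddings
bias = tf.zeros((vocab_size,))

# Flatten to 2-D, multiply by the transposed embedding matrix, then restore [batch, seq, vocab].
logits = tf.reshape(hidden_states, shape=[-1, embedding_size])
logits = tf.matmul(logits, shared_weight, transpose_b=True)
logits = tf.reshape(logits, shape=[-1, seq_length, vocab_size])
logits = tf.nn.bias_add(logits, bias)
print(logits.shape)  # (2, 5, 100)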
...@@ -74,89 +74,78 @@ TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [
INF = 1e6
# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFFunnelWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
class TFFunnelEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = TFFunnelWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout)
def call(self, input_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
assert not (input_ids is not None and inputs_embeds is not None)
if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)
final_embeddings = self.LayerNorm(inputs=inputs_embeds)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
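For reference, a rough standalone equivalent of the lookup these *WordEmbeddings layers perform: flatten the ids, gather rows from the weight matrix, and reshape back, so id tensors of any rank are handled (toy sizes, not the actual layer):

import tensorflow as tf

vocab_size, hidden_size = 100, 8
weight = tf.random.normal((vocab_size, hidden_size))
input_ids = tf.constant([[1, 2, 3], [4, 5, 6]])

flat_ids = tf.reshape(input_ids, shape=[-1])
embeddings = tf.gather(params=weight, indices=flat_ids)
embeddings = tf.reshape(embeddings, shape=tf.concat([tf.shape(input_ids), [hidden_size]], axis=0))
print(embeddings.shape)  # (2, 3, 8)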
class TFFunnelAttentionStructure:
...@@ -784,11 +773,11 @@ class TFFunnelBaseLayer(tf.keras.layers.Layer):
self.encoder = TFFunnelEncoder(config, name="encoder")
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings.weight = value
self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models
...@@ -870,11 +859,11 @@ class TFFunnelMainLayer(tf.keras.layers.Layer):
self.decoder = TFFunnelDecoder(config, name="decoder")
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings.weight = value
self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models
...@@ -987,17 +976,19 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
return self.input_embeddings
def set_output_embeddings(self, value):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
...@@ -1008,8 +999,12 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer):
self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states, training=False):
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
...@@ -1362,7 +1357,7 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss)
super().__init__(config, *inputs, **kwargs)
self.funnel = TFFunnelMainLayer(config, name="funnel")
self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings.word_embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
...
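Because get_input_embeddings now returns the word-embedding sub-layer rather than the whole embeddings block, swapping in a resized matrix becomes a plain attribute assignment on that sub-layer, as the setters above show. A hedged toy sketch of the idea (ToyWordEmbeddings is illustrative, not a transformers class):

import tensorflow as tf

class ToyWordEmbeddings(tf.keras.layers.Layer):
    # Illustrative stand-in for the per-model word-embedding sub-layers.
    def __init__(self, vocab_size, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def build(self, input_shape):
        self.weight = self.add_weight(name="weight", shape=[self.vocab_size, self.hidden_size])
        super().build(input_shape)

    def call(self, input_ids):
        return tf.gather(self.weight, input_ids)

layer = ToyWordEmbeddings(vocab_size=10, hidden_size=4)
_ = layer(tf.constant([[1, 2, 3]]))                   # build it so `weight` exists

new_weight = tf.Variable(tf.random.normal((12, 4)))   # e.g. a matrix resized for two extra tokens
layer.weight = new_weight
layer.vocab_size = new_weight.shape[0]
print(layer(tf.constant([[11]])).shape)  # (1, 1, 4)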
...@@ -415,14 +415,135 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se
return attention_mask
# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFLongformerWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
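Since each sub-layer now implements get_config, it can be round-tripped through Keras serialization. A hedged sketch of what that enables, using a toy layer with the same config pattern (the class name is hypothetical):

import tensorflow as tf

class ToyEmbeddingConfigLayer(tf.keras.layers.Layer):
    def __init__(self, vocab_size: int, hidden_size: int, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def get_config(self):
        config = {"vocab_size": self.vocab_size, "hidden_size": self.hidden_size}
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))

layer = ToyEmbeddingConfigLayer(vocab_size=100, hidden_size=8, name="word_embeddings")
clone = ToyEmbeddingConfigLayer.from_config(layer.get_config())  # rebuilds an equivalent layer
print(clone.vocab_size, clone.hidden_size, clone.name)  # 100 8 word_embeddings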
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFLongformerTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
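The token-type sub-layer above looks its rows up with a one-hot matmul rather than a gather, a pattern often preferred on accelerators for a tiny vocabulary. A standalone check that both give the same result (toy sizes):

import tensorflow as tf

type_vocab_size, hidden_size = 2, 4
table = tf.random.normal((type_vocab_size, hidden_size))
token_type_ids = tf.constant([[0, 0, 1, 1]])

one_hot = tf.one_hot(tf.reshape(token_type_ids, [-1]), depth=type_vocab_size, dtype=table.dtype)
via_matmul = tf.reshape(tf.matmul(one_hot, table), shape=(1, 4, hidden_size))
via_gather = tf.gather(table, token_type_ids)
print(bool(tf.reduce_all(tf.equal(via_matmul, via_gather))))  # True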
class TFLongformerPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Longformer
class TFLongformerLMHead(tf.keras.layers.Layer):
"""Longformer Head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
...@@ -442,7 +563,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
return self.decoder
def set_output_embeddings(self, value):
self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self):
...@@ -458,11 +579,16 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
hidden_states = self.layer_norm(hidden_states)
# project back to size of vocabulary with bias
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings with Roberta->Longformer
class TFLongformerEmbeddings(tf.keras.layers.Layer):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
...@@ -472,39 +598,27 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.padding_idx = 1
self.word_embeddings = TFLongformerWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFLongformerPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFLongformerTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def create_position_ids_from_input_ids(self, input_ids):
"""
...@@ -516,14 +630,16 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor
"""
input_ids_shape = shape_list(tensor=input_ids)
# multiple choice has 3 dimensions
if len(input_ids_shape) == 3:
input_ids = tf.reshape(
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
)
mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask
return incremental_indices + self.padding_idx
...@@ -536,96 +652,41 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor
"""
batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2]
position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :]
return tf.tile(input=position_ids, multiples=(batch_size, 1))
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)
if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds)
position_embeds = self.position_embeddings(position_ids=position_ids)
token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
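A standalone sketch of the RoBERTa-style position ids built above: positions are a cumulative count of non-padding tokens offset by padding_idx, so padded slots keep the padding position (toy values):

import tensorflow as tf

padding_idx = 1
input_ids = tf.constant([[5, 7, 9, padding_idx, padding_idx]])

mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=input_ids.dtype)
position_ids = tf.math.cumsum(mask, axis=1) * mask + padding_idx
print(position_ids.numpy())  # [[2 3 4 1 1]]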
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate
...@@ -1613,11 +1674,11 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
self.pooler = TFLongformerPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings.weight = value
self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
...@@ -2053,7 +2114,7 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
super().__init__(config, *inputs, **kwargs)
self.longformer = TFLongformerMainLayer(config, add_pooling_layer=False, name="longformer")
self.lm_head = TFLongformerLMHead(config, self.longformer.embeddings.word_embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
...
...@@ -177,112 +177,173 @@ class TFLxmertVisualFeatureEncoder(tf.keras.layers.Layer):
return output
class TFLxmertEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
"""Construct the embeddings from word, position and token_type embeddings.""" class TFLxmertWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.initializer_range = config.initializer_range
self.position_embeddings = tf.keras.layers.Embedding( self.vocab_size = vocab_size
config.max_position_embeddings, self.hidden_size = hidden_size
config.hidden_size, self.initializer_range = initializer_range
embeddings_initializer=get_initializer(self.initializer_range),
name="position_embeddings", def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
) )
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size, super().build(input_shape=input_shape)
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range), def get_config(self):
name="token_type_embeddings", config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
) )
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") return embeddings
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFLxmertTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape):
"""Build shared word embedding layer """ self.token_type_embeddings = self.add_weight(
with tf.name_scope("word_embeddings"): name="embeddings",
# Create and initialize weights. The random normal initializer was chosen shape=[self.type_vocab_size, self.hidden_size],
# arbitrarily, and works well. initializer=get_initializer(initializer_range=self.initializer_range),
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape)
def call(self, inputs, mode="embedding", training=False): super().build(input_shape=input_shape)
"""
Get token embeddings of inputs.
Args: def get_config(self):
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) config = {
mode: string, a valid value is one of "embedding" and "linear". "type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
Returns: return dict(list(base_config.items()) + list(config.items()))
outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length,
embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length,
vocab_size].
Raises: def call(self, token_type_ids):
ValueError: if mode is not valid. flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
Shared weights logic adapted from embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(inputs, training=training)
elif mode == "linear":
return self._linear(inputs)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, inputs, training=False): return embeddings
"""Applies embedding based on inputs tensor."""
input_ids, token_type_ids, inputs_embeds = inputs
if input_ids is not None:
input_shape = shape_list(input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1] # Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] class TFLxmertPositionEmbeddings(tf.keras.layers.Layer):
if token_type_ids is None: def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
token_type_ids = tf.fill(input_shape, 0) super().__init__(**kwargs)
if inputs_embeds is None: self.max_position_embeddings = max_position_embeddings
inputs_embeds = tf.gather(self.word_embeddings, input_ids) self.hidden_size = hidden_size
position_embeddings = self.position_embeddings(position_ids) self.initializer_range = initializer_range
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings def build(self, input_shape):
embeddings = self.LayerNorm(embeddings) self.position_embeddings = self.add_weight(
embeddings = self.dropout(embeddings, training=training) name="embeddings",
return embeddings shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
def _linear(self, inputs): super().build(input_shape)
"""
Computes logits by running inputs through a linear layer.
Args: def get_config(self):
inputs: A float32 tensor with shape [batch_size, length, hidden_size] config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape)
class TFLxmertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = TFLxmertWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFLxmertPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFLxmertTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)
if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
position_embeds = self.position_embeddings(position_ids=inputs_embeds)
token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
class TFLxmertAttention(tf.keras.layers.Layer):
...@@ -703,11 +764,11 @@ class TFLxmertMainLayer(tf.keras.layers.Layer):
self.config = config
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings.weight = value
self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
...@@ -787,7 +848,7 @@ class TFLxmertMainLayer(tf.keras.layers.Layer):
# Positional Word Embeddings
embedding_output = self.embeddings(
inputs["input_ids"], inputs["token_type_ids"], inputs["inputs_embeds"], training=inputs["training"]
)
# Run Lxmert encoder
...@@ -1066,31 +1127,38 @@ class TFLxmertPooler(tf.keras.layers.Layer):
return pooled_output
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert
class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert
class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.transform = TFLxmertPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is
...@@ -1099,13 +1167,14 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
return self.input_embeddings
def set_output_embeddings(self, value):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
...@@ -1116,12 +1185,17 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.transform(hidden_states=hidden_states)
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Lxmert
class TFLxmertMLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
...@@ -1130,6 +1204,7 @@ class TFLxmertMLMHead(tf.keras.layers.Layer):
def call(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
...@@ -1229,7 +1304,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
self.lxmert = TFLxmertMainLayer(config, name="lxmert")
# Pre-training heads
self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings.word_embeddings, name="cls")
if self.task_obj_predict:
self.obj_predict_head = TFLxmertVisualObjHead(config, name="obj_predict_head")
if self.task_qa:
...
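One calling-convention change worth noting from the Lxmert hunk above: the embeddings layer is now invoked with separate tensors instead of one packed list. A toy sketch of the new-style signature (illustrative layer, not the actual TFLxmertEmbeddings):

import tensorflow as tf

class ToyEmbeddings(tf.keras.layers.Layer):
    def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
        if inputs_embeds is None:
            inputs_embeds = tf.one_hot(input_ids, depth=16)  # stand-in for a real lookup
        return inputs_embeds

embeddings = ToyEmbeddings()
input_ids = tf.constant([[3, 1, 4]])
token_type_ids = tf.zeros_like(input_ids)

# old style: embeddings([input_ids, token_type_ids, None], training=False)
output = embeddings(input_ids, token_type_ids, None, training=False)
print(output.shape)  # (1, 3, 16)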
...@@ -107,30 +107,150 @@ class TFNoNorm(tf.keras.layers.Layer):
NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm}
# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFMobileBertWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFMobileBertTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
class TFMobileBertPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape)
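MobileBERT is the one model in this diff whose word embeddings use a smaller embedding_size than the transformer's hidden_size, so the embeddings block below keeps an embedding_transformation Dense to project up (with trigram_input, neighbouring token embeddings are concatenated first). A rough sketch of just that projection step (toy sizes, trigram logic omitted):

import tensorflow as tf

embedding_size, hidden_size = 4, 8
embedding_transformation = tf.keras.layers.Dense(hidden_size)

word_embeds = tf.random.normal((2, 5, embedding_size))   # [batch, seq, embedding_size]
projected = embedding_transformation(word_embeds)        # [batch, seq, hidden_size]
print(projected.shape)  # (2, 5, 8)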
class TFMobileBertEmbeddings(tf.keras.layers.Layer): class TFMobileBertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.""" """Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.trigram_input = config.trigram_input self.trigram_input = config.trigram_input
self.embedding_size = config.embedding_size self.embedding_size = config.embedding_size
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.initializer_range = config.initializer_range self.word_embeddings = TFMobileBertWordEmbeddings(
vocab_size=config.vocab_size,
self.position_embeddings = tf.keras.layers.Embedding( hidden_size=config.embedding_size,
config.max_position_embeddings, initializer_range=config.initializer_range,
config.hidden_size, name="word_embeddings",
embeddings_initializer=get_initializer(self.initializer_range), )
self.position_embeddings = TFMobileBertPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings", name="position_embeddings",
) )
self.token_type_embeddings = tf.keras.layers.Embedding( self.token_type_embeddings = TFMobileBertTokenTypeEmbeddings(
config.type_vocab_size, type_vocab_size=config.type_vocab_size,
config.hidden_size, hidden_size=config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range), initializer_range=config.initializer_range,
name="token_type_embeddings", name="token_type_embeddings",
) )
self.embeddings_sum = tf.keras.layers.Add()
self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation") self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation")
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
@@ -138,71 +258,23 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = NORM2FN[config.normalization_type]( self.LayerNorm = NORM2FN[config.normalization_type](
config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm"
) )
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape):
"""Build shared word embedding layer """
with tf.name_scope("word_embeddings"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
def call( def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
):
""" """
Get token embeddings of inputs. Applies embedding based on inputs tensor.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
mode: string, a valid value is one of "embedding" and "linear".
Returns: Returns:
outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length,
vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
""" """
if mode == "embedding":
return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
assert not (input_ids is None and inputs_embeds is None) assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None: if input_ids is not None:
input_shape = shape_list(input_ids) inputs_embeds = self.word_embeddings(input_ids=input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
if token_type_ids is None: if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0) input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
if self.trigram_input: if self.trigram_input:
# From the paper MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited # From the paper MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited
@@ -224,32 +296,17 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
if self.trigram_input or self.embedding_size != self.hidden_size: if self.trigram_input or self.embedding_size != self.hidden_size:
inputs_embeds = self.embedding_transformation(inputs_embeds) inputs_embeds = self.embedding_transformation(inputs_embeds)
position_embeddings = self.position_embeddings(position_ids) if position_ids is None:
token_type_embeddings = self.token_type_embeddings(token_type_ids) position_embeds = self.position_embeddings(position_ids=inputs_embeds)
else:
embeddings = inputs_embeds + position_embeddings + token_type_embeddings position_embeds = self.position_embeddings(position_ids=position_ids)
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings
def _linear(self, inputs):
"""
Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns:
float32 tensor with shape [batch_size, length, vocab_size].
"""
batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.hidden_size]) token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
logits = tf.matmul(x, self.word_embeddings, transpose_b=True) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) return final_embeddings
class TFMobileBertSelfAttention(tf.keras.layers.Layer): class TFMobileBertSelfAttention(tf.keras.layers.Layer):
@@ -715,11 +772,11 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
self.pooler = TFMobileBertPooler(config, name="pooler") if add_pooling_layer else None self.pooler = TFMobileBertPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" """
@@ -86,143 +86,171 @@ class TFMPNetPreTrainedModel(TFPreTrainedModel):
return self.serving_output(output) return self.serving_output(output)
class TFMPNetEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
"""Construct the embeddings from word, position embeddings.""" class TFMPNetWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.vocab_size = vocab_size
self.vocab_size = config.vocab_size self.hidden_size = hidden_size
self.hidden_size = config.hidden_size self.initializer_range = initializer_range
self.initializer_range = config.initializer_range
self.position_embeddings = tf.keras.layers.Embedding( def build(self, input_shape):
config.max_position_embeddings, self.weight = self.add_weight(
config.hidden_size, name="weight",
embeddings_initializer=get_initializer(self.initializer_range), shape=[self.vocab_size, self.hidden_size],
name="position_embeddings", initializer=get_initializer(initializer_range=self.initializer_range),
) )
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load super().build(input_shape=input_shape)
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") def get_config(self):
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
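The lookup above is a flat gather followed by a reshape back to the input shape; set_shape only restores the static shape information that the dynamic reshape loses. The same steps in isolation (illustrative sketch, TensorFlow 2.x assumed):

import tensorflow as tf

vocab_size, hidden_size = 10, 4
weight = tf.random.normal([vocab_size, hidden_size])
input_ids = tf.constant([[3, 1, 4], [1, 5, 9]])                   # [batch, seq]

flat_ids = tf.reshape(input_ids, [-1])
embeddings = tf.gather(params=weight, indices=flat_ids)           # [batch * seq, hidden]
embeddings = tf.reshape(
    embeddings, tf.concat([tf.shape(input_ids), [hidden_size]], axis=0)
)                                                                 # [batch, seq, hidden]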
# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerPositionEmbeddings
class TFMPNetPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape):
"""Build shared word embedding layer""" self.position_embeddings = self.add_weight(
with tf.name_scope("word_embeddings"): name="embeddings",
# Create and initialize weights. The random normal initializer was chosen shape=[self.max_position_embeddings, self.hidden_size],
# arbitrarily, and works well. initializer=get_initializer(initializer_range=self.initializer_range),
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape)
def create_position_ids_from_input_ids(self, x): def get_config(self):
""" config = {
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding "max_position_embeddings": self.max_position_embeddings,
symbols are ignored. This is modified from fairseq's `utils.make_positions`. :param tf.Tensor x: :return "hidden_size": self.hidden_size,
tf.Tensor: "initializer_range": self.initializer_range,
""" }
mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32) base_config = super().get_config()
incremental_indicies = tf.math.cumsum(mask, axis=1) * mask
return incremental_indicies + self.padding_idx return dict(list(base_config.items()) + list(config.items()))
def create_position_ids_from_inputs_embeds(self, inputs_embeds): def call(self, position_ids):
""" flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
:param tf.Tensor inputs_embeds: :return tf.Tensor: embeddings = tf.reshape(
""" tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0)
seq_length = shape_list(inputs_embeds)[1] )
position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :]
return position_ids embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size])
def call( return embeddings
self,
input_ids=None,
position_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
):
"""
Get token embeddings of inputs
Args:
inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids)
mode: string, a valid value is one of "embedding" and "linear"
Returns: class TFMPNetEmbeddings(tf.keras.layers.Layer):
outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, """Construct the embeddings from word, position embeddings."""
embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length,
vocab_size]
Raises: def __init__(self, config, **kwargs):
ValueError: if mode is not valid. Shared weights logic adapted from super().__init__(**kwargs)
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(input_ids, position_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids, position_ids, inputs_embeds, training=False): self.padding_idx = 1
"""Applies embedding based on inputs tensor.""" self.word_embeddings = TFMPNetWordEmbeddings(
assert not (input_ids is None and inputs_embeds is None) vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFMPNetPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
if position_ids is None: def create_position_ids_from_input_ids(self, input_ids):
if input_ids is not None: """
# Create the position ids from the input token ids. Any padded tokens remain padded. Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
position_ids = self.create_position_ids_from_input_ids(input_ids) symbols are ignored. This is modified from fairseq's `utils.make_positions`.
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
assert position_ids is None or len(position_ids.shape) <= 2 Args:
input_ids: tf.Tensor
if input_ids is not None: Returns: tf.Tensor
input_shape = shape_list(input_ids) """
else: input_ids_shape = shape_list(tensor=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1] # multiple choice has 3 dimensions
if len(input_ids_shape) == 3:
input_ids = tf.reshape(
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
)
if position_ids is None: mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype)
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask
if inputs_embeds is None: return incremental_indices + self.padding_idx
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) def create_position_ids_from_inputs_embeds(self, inputs_embeds):
embeddings = inputs_embeds + position_embeddings """
embeddings = self.LayerNorm(embeddings) We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
embeddings = self.dropout(embeddings, training=training)
return embeddings Args:
inputs_embeds: tf.Tensor
def _linear(self, inputs): Returns: tf.Tensor
""" """
Computes logits by running inputs through a linear layer batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2]
position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :]
Args: return tf.tile(input=position_ids, multiples=(batch_size, 1))
inputs: A float32 tensor with shape [batch_size, length, hidden_size
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
""" """
batch_size = shape_list(inputs)[0] assert not (input_ids is None and inputs_embeds is None)
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)
if position_ids is None:
if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds)
position_embeds = self.position_embeddings(position_ids=position_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
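A quick worked example of create_position_ids_from_input_ids above: padded positions stay at padding_idx, so they land on a dedicated "pad" slot of the position table, while real tokens count up from padding_idx + 1 (illustration only, padding_idx = 1 as in MPNet and RoBERTa):

import tensorflow as tf

padding_idx = 1
input_ids = tf.constant([[5, 6, 7, padding_idx, padding_idx]])

mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(mask, axis=1) * mask
position_ids = incremental_indices + padding_idx
# position_ids -> [[2, 3, 4, 1, 1]]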
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler
@@ -536,12 +564,12 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
@@ -808,6 +836,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
@@ -827,7 +856,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
return self.decoder return self.decoder
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.decoder.word_embeddings = value self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0] self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
@@ -837,15 +866,19 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.vocab_size = shape_list(value["bias"])[0]
def call(self, features): def call(self, hidden_states):
x = self.dense(features) hidden_states = self.dense(hidden_states)
x = self.act(x) hidden_states = self.act(hidden_states)
x = self.layer_norm(x) hidden_states = self.layer_norm(hidden_states)
# project back to size of vocabulary with bias # project back to size of vocabulary with bias
x = self.decoder(x, mode="linear") + self.bias seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return x return hidden_states
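Since the old mode="linear" path is gone, the LM head now projects back to the vocabulary itself: reshape to 2-D, multiply with the tied word-embedding matrix (transpose_b=True), reshape back, and add the bias. The projection in isolation (illustrative sketch with dummy tensors, TensorFlow 2.x assumed):

import tensorflow as tf

vocab_size, hidden_size, seq_length = 10, 4, 3
decoder_weight = tf.random.normal([vocab_size, hidden_size])      # tied word-embedding matrix
bias = tf.zeros([vocab_size])
hidden_states = tf.random.normal([2, seq_length, hidden_size])

flat = tf.reshape(hidden_states, [-1, hidden_size])               # [batch * seq, hidden]
logits = tf.matmul(flat, decoder_weight, transpose_b=True)        # [batch * seq, vocab]
logits = tf.reshape(logits, [-1, seq_length, vocab_size])         # [batch, seq, vocab]
logits = tf.nn.bias_add(logits, bias)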
@add_start_docstrings("""MPNet Model with a `language modeling` head on top. """, MPNET_START_DOCSTRING) @add_start_docstrings("""MPNet Model with a `language modeling` head on top. """, MPNET_START_DOCSTRING)
@@ -857,7 +890,7 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.mpnet = TFMPNetMainLayer(config, name="mpnet") self.mpnet = TFMPNetMainLayer(config, name="mpnet")
self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings, name="lm_head") self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings.word_embeddings, name="lm_head")
def get_lm_head(self): def get_lm_head(self):
return self.lm_head return self.lm_head
@@ -65,6 +65,127 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
] ]
# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFRobertaWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
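Each of these new sub-layers implements get_config, so it can be rebuilt from its config the usual Keras way. A hedged sketch, assuming this branch of transformers is installed and the class is importable from the RoBERTa model file:

from transformers.models.roberta.modeling_tf_roberta import TFRobertaWordEmbeddings

layer = TFRobertaWordEmbeddings(vocab_size=10, hidden_size=4, initializer_range=0.02)
restored = TFRobertaWordEmbeddings.from_config(layer.get_config())
assert restored.vocab_size == layer.vocab_size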
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFRobertaTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerPositionEmbeddings
class TFRobertaPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size])
return embeddings
class TFRobertaEmbeddings(tf.keras.layers.Layer): class TFRobertaEmbeddings(tf.keras.layers.Layer):
""" """
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
@@ -74,52 +195,48 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.vocab_size = config.vocab_size self.word_embeddings = TFRobertaWordEmbeddings(
self.hidden_size = config.hidden_size vocab_size=config.vocab_size,
self.initializer_range = config.initializer_range hidden_size=config.hidden_size,
self.position_embeddings = tf.keras.layers.Embedding( initializer_range=config.initializer_range,
config.max_position_embeddings, name="word_embeddings",
config.hidden_size, )
embeddings_initializer=get_initializer(self.initializer_range), self.position_embeddings = TFRobertaPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings", name="position_embeddings",
) )
self.token_type_embeddings = tf.keras.layers.Embedding( self.token_type_embeddings = TFRobertaTokenTypeEmbeddings(
config.type_vocab_size, type_vocab_size=config.type_vocab_size,
config.hidden_size, hidden_size=config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range), initializer_range=config.initializer_range,
name="token_type_embeddings", name="token_type_embeddings",
) )
self.embeddings_sum = tf.keras.layers.Add()
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape):
"""Build shared word embedding layer """
with tf.name_scope("word_embeddings"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
def create_position_ids_from_input_ids(self, x): def create_position_ids_from_input_ids(self, input_ids):
""" """
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`. symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args: Args:
x: tf.Tensor input_ids: tf.Tensor
Returns: tf.Tensor Returns: tf.Tensor
""" """
mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32) input_ids_shape = shape_list(tensor=input_ids)
incremental_indices = tf.math.cumsum(mask, axis=1) * mask
# multiple choice has 3 dimensions
if len(input_ids_shape) == 3:
input_ids = tf.reshape(
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
)
mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask
return incremental_indices + self.padding_idx return incremental_indices + self.padding_idx
@@ -132,96 +249,41 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor Returns: tf.Tensor
""" """
seq_length = shape_list(inputs_embeds)[1] batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2]
position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :]
return position_ids return tf.tile(input=position_ids, multiples=(batch_size, 1))
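When only inputs_embeds are given there is no way to tell which positions are padding, so the code falls back to sequential ids starting at padding_idx + 1 and now tiles them explicitly over the batch instead of relying on broadcasting. Illustration (values are mine):

import tensorflow as tf

padding_idx = 1
batch_size, seq_length = 2, 4
position_ids = tf.range(start=padding_idx + 1, limit=seq_length + padding_idx + 1)[tf.newaxis, :]
position_ids = tf.tile(position_ids, multiples=(batch_size, 1))
# position_ids -> [[2, 3, 4, 5], [2, 3, 4, 5]]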
def call( def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
):
""" """
Get token embeddings of inputs. Applies embedding based on inputs tensor.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
mode: string, a valid value is one of "embedding" and "linear".
Returns: Returns:
outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length,
vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
""" """
if mode == "embedding":
return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
assert not (input_ids is None and inputs_embeds is None) assert not (input_ids is None and inputs_embeds is None)
if position_ids is None:
if input_ids is not None: if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded. inputs_embeds = self.word_embeddings(input_ids=input_ids)
position_ids = self.create_position_ids_from_input_ids(input_ids)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = shape_list(input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
if token_type_ids is None: if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0) input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype)
token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings
def _linear(self, inputs):
"""
Computes logits by running inputs through a linear layer.
Args: if position_ids is None:
inputs: A float32 tensor with shape [batch_size, length, hidden_size] if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds)
Returns: position_embeds = self.position_embeddings(position_ids=position_ids)
float32 tensor with shape [batch_size, length, vocab_size]. token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
""" final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
batch_size = shape_list(inputs)[0] final_embeddings = self.LayerNorm(inputs=final_embeddings)
length = shape_list(inputs)[1] final_embeddings = self.dropout(inputs=final_embeddings, training=training)
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) return final_embeddings
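After the refactor the three embedding tensors are combined with a Keras Add layer, then LayerNorm and dropout, rather than by bare tensor addition and dtype casts. A minimal sketch of that composition with dummy tensors (illustration only; the epsilon and dropout rate are placeholders, not values from the diff):

import tensorflow as tf

hidden_size = 4
inputs_embeds = tf.random.normal([2, 5, hidden_size])
position_embeds = tf.random.normal([2, 5, hidden_size])
token_type_embeds = tf.random.normal([2, 5, hidden_size])

embeddings_sum = tf.keras.layers.Add()
layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
dropout = tf.keras.layers.Dropout(rate=0.1)

final_embeddings = embeddings_sum([inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = layer_norm(final_embeddings)
final_embeddings = dropout(final_embeddings, training=True)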
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler
@@ -245,7 +307,7 @@ class TFRobertaPooler(tf.keras.layers.Layer):
return pooled_output return pooled_output
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention
class TFRobertaSelfAttention(tf.keras.layers.Layer): class TFRobertaSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
@@ -256,8 +318,8 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
f"of attention heads ({config.num_attention_heads})" f"of attention heads ({config.num_attention_heads})"
) )
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.query = tf.keras.layers.experimental.EinsumDense( self.query = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
@@ -293,7 +355,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)
if attention_mask is not None: if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in TFRobertaModel call() function) # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
attention_scores = attention_scores + attention_mask attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities. # Normalize the attention scores to probabilities.
@@ -324,9 +386,9 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer):
f"of attention heads ({config.num_attention_heads})" f"of attention heads ({config.num_attention_heads})"
) )
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = config.num_attention_heads * self.attention_head_size
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abcd,cde->abe", equation="abcd,cde->abe",
output_shape=(None, self.all_head_size), output_shape=(None, self.all_head_size),
@@ -499,12 +561,12 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
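Because get_input_embeddings now returns the word-embedding sub-layer and set_input_embeddings writes to its weight, utilities built on top of them, such as resize_token_embeddings, keep working through the new attribute path. A hedged usage sketch (assumes the checkpoint can be downloaded; behaviour as I understand it at this commit):

from transformers import TFRobertaForMaskedLM

model = TFRobertaForMaskedLM.from_pretrained("roberta-base")
embedding_layer = model.get_input_embeddings()          # the TFRobertaWordEmbeddings sub-layer
model.resize_token_embeddings(model.config.vocab_size + 8)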
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
@@ -814,6 +876,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
@@ -833,7 +896,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
return self.decoder return self.decoder
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.decoder.word_embeddings = value self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0] self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
@@ -849,7 +912,11 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
hidden_states = self.layer_norm(hidden_states) hidden_states = self.layer_norm(hidden_states)
# project back to size of vocabulary with bias # project back to size of vocabulary with bias
hidden_states = self.decoder(hidden_states, mode="linear") + self.bias seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
@@ -863,7 +930,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta") self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings.word_embeddings, name="lm_head")
def get_lm_head(self): def get_lm_head(self):
return self.lm_head return self.lm_head
@@ -66,128 +66,182 @@ TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [
] ]
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}WordEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = vocab_size
self.hidden_size = config.hidden_size self.hidden_size = hidden_size
self.initializer_range = config.initializer_range self.initializer_range = initializer_range
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings, def build(self, input_shape):
config.hidden_size, self.weight = self.add_weight(
embeddings_initializer=get_initializer(self.initializer_range), name="weight",
name="position_embeddings", shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
) )
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size, super().build(input_shape=input_shape)
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range), def get_config(self):
name="token_type_embeddings", config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
) )
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") return embeddings
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape):
"""Build shared word embedding layer """ self.token_type_embeddings = self.add_weight(
with tf.name_scope("word_embeddings"): name="embeddings",
# Create and initialize weights. The random normal initializer was chosen shape=[self.type_vocab_size, self.hidden_size],
# arbitrarily, and works well. initializer=get_initializer(initializer_range=self.initializer_range),
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
) )
super().build(input_shape) super().build(input_shape=input_shape)
def call( def get_config(self):
self, config = {
input_ids=None, "type_vocab_size": self.type_vocab_size,
position_ids=None, "hidden_size": self.hidden_size,
token_type_ids=None, "initializer_range": self.initializer_range,
inputs_embeds=None, }
mode="embedding", base_config = super().get_config()
training=False,
):
"""
Get token embeddings of inputs.
Args: return dict(list(base_config.items()) + list(config.items()))
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
mode: string, a valid value is one of "embedding" and "linear".
Returns: def call(self, token_type_ids):
outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
vocab_size]. embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
Raises: embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
ValueError: if mode is not valid.
Shared weights logic adapted from return embeddings
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None: # Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
input_shape = shape_list(input_ids) class TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(tf.keras.layers.Layer):
else: def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
input_shape = shape_list(inputs_embeds)[:-1] super().__init__(**kwargs)
seq_length = input_shape[1] self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
if position_ids is None: def build(self, input_shape):
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
if token_type_ids is None: super().build(input_shape)
token_type_ids = tf.fill(input_shape, 0)
if inputs_embeds is None: def get_config(self):
inputs_embeds = tf.gather(self.word_embeddings, input_ids) config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) return dict(list(base_config.items()) + list(config.items()))
token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
def _linear(self, inputs): return tf.broadcast_to(input=position_embeddings, shape=input_shape)
"""
Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size]. # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = TF{{cookiecutter.camelcase_modelname}}WordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
""" """
batch_size = shape_list(inputs)[0] assert not (input_ids is None and inputs_embeds is None)
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.hidden_size]) if input_ids is not None:
logits = tf.matmul(x, self.word_embeddings, transpose_b=True) inputs_embeds = self.word_embeddings(input_ids=input_ids)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_embeds = self.position_embeddings(position_ids=inputs_embeds)
else:
position_embeds = self.position_embeddings(position_ids=position_ids)
token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}}
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention
class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
@@ -198,8 +252,8 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer)
f"of attention heads ({config.num_attention_heads})" f"of attention heads ({config.num_attention_heads})"
) )
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.query = tf.keras.layers.experimental.EinsumDense( self.query = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
@@ -266,9 +320,9 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer):
f"of attention heads ({config.num_attention_heads})" f"of attention heads ({config.num_attention_heads})"
) )
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = config.num_attention_heads * self.attention_head_size
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abcd,cde->abe", equation="abcd,cde->abe",
output_shape=(None, self.all_head_size), output_shape=(None, self.all_head_size),
@@ -450,6 +504,8 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform") self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is # The output weights are the same as the input embeddings, but there is
@@ -465,7 +521,7 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.input_embeddings.word_embeddings = value self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0] self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
@@ -476,9 +532,12 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
self.vocab_size = shape_list(value["bias"])[0] self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.transform(hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
hidden_states = self.input_embeddings(hidden_states, mode="linear") seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = hidden_states + self.bias hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
@@ -514,11 +573,11 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer):
self.config = config self.config = config
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model. """Prunes heads of the model.
@@ -812,7 +871,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca
) )
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls")
def get_lm_head(self): def get_lm_head(self):
return self.mlm.predictions return self.mlm.predictions
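
The MLM head is now handed the inner word-embeddings sub-layer itself rather than the whole embeddings block, so the input lookup and the output projection share a single object and therefore a single trainable matrix. A toy illustration of that object-sharing form of weight tying, using a stock Keras Embedding layer as a stand-in (names and shapes are invented):

```python
import tensorflow as tf

shared = tf.keras.layers.Embedding(input_dim=12, output_dim=4)  # stand-in for the shared word-embedding layer

input_ids = tf.constant([[3, 7, 1]])
hidden = shared(input_ids)  # input side: id -> vector lookup, builds the layer

# Output side: reuse the very same matrix for the vocabulary projection.
flat = tf.reshape(hidden, [-1, 4])
logits = tf.matmul(flat, shared.embeddings, transpose_b=True)
logits = tf.reshape(logits, [1, -1, 12])
print(logits.shape)  # (1, 3, 12)

# Exactly one trainable matrix exists; replacing or resizing it updates both ends at once.
print(len(shared.trainable_weights))  # 1
```
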
@@ -909,7 +968,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca
            logger.warning("If you want to use `TF{{cookiecutter.camelcase_modelname}}ForCausalLM` as a standalone, add `is_decoder=True.`")

        self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
-        self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
+        self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls")

    def get_lm_head(self):
        return self.mlm.predictions
...
@@ -760,31 +760,6 @@ class TFModelTesterMixin:
                model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
            )

-    def _get_embeds(self, wte, input_ids):
-        # ^^ In our TF models, the input_embeddings can take slightly different forms,
-        # so we try a few of them.
-        # We used to fall back to just synthetically creating a dummy tensor of ones:
-        try:
-            x = wte(input_ids, mode="embedding")
-        except Exception:
-            try:
-                x = wte([input_ids], mode="embedding")
-            except Exception:
-                try:
-                    x = wte([input_ids, None, None, None], mode="embedding")
-                except Exception:
-                    if hasattr(self.model_tester, "embedding_size"):
-                        x = tf.ones(
-                            input_ids.shape + [self.model_tester.embedding_size],
-                            dtype=tf.dtypes.float32,
-                        )
-                    else:
-                        x = tf.ones(
-                            input_ids.shape + [self.model_tester.hidden_size],
-                            dtype=tf.dtypes.float32,
-                        )
-        return x
-
    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -801,12 +776,11 @@ class TFModelTesterMixin:
                del inputs["input_ids"]
                inputs.pop("decoder_input_ids", None)

-            wte = model.get_input_embeddings()
            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = self._get_embeds(wte, input_ids)
+                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
            else:
-                inputs["inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
+                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
+                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)

            model(inputs)
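
With every TF embedding layer now accepting plain integer ids, the old try/except ladder in _get_embeds is no longer needed and the test can call the returned layer directly. A minimal sketch of the calling pattern the test relies on, with a stock Keras Embedding standing in for whatever model.get_input_embeddings() returns (the shapes are illustrative only):

```python
import tensorflow as tf

input_ids = tf.constant([[5, 1, 7, 2]])  # (batch, seq_len)
stand_in_embeddings = tf.keras.layers.Embedding(input_dim=20, output_dim=4)

inputs_embeds = stand_in_embeddings(input_ids)  # (batch, seq_len, hidden)
print(inputs_embeds.shape)  # (1, 4, 4)

# The test then feeds `inputs_embeds` (or `decoder_inputs_embeds`) to the model
# instead of the integer `input_ids`.
```
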
@@ -837,23 +811,24 @@ class TFModelTesterMixin:
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        def _get_word_embedding_weight(model, embedding_layer):
-            if hasattr(embedding_layer, "word_embeddings"):
-                return embedding_layer.word_embeddings
-            elif hasattr(embedding_layer, "weight"):
-                return embedding_layer.weight
-            elif hasattr(embedding_layer, "decoder"):
-                return embedding_layer.decoder
-            else:
-                # Here we build the word embeddings weights if not exists.
-                # And then we retry to get the attribute once built.
-                model(model.dummy_inputs)
-                if hasattr(embedding_layer, "word_embeddings"):
-                    return embedding_layer.word_embeddings
-                elif hasattr(embedding_layer, "weight"):
-                    return embedding_layer.weight
-                elif hasattr(embedding_layer, "decoder"):
-                    return embedding_layer.decoder
-                else:
-                    return None
+            embeds = getattr(embedding_layer, "weight", None)
+            if embeds is not None:
+                return embeds
+
+            embeds = getattr(embedding_layer, "decoder", None)
+            if embeds is not None:
+                return embeds
+
+            model(model.dummy_inputs)
+
+            embeds = getattr(embedding_layer, "weight", None)
+            if embeds is not None:
+                return embeds
+
+            embeds = getattr(embedding_layer, "decoder", None)
+            if embeds is not None:
+                return embeds
+
+            return None

        for model_class in self.all_model_classes:
...
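
This test helper mirrors the one in modeling_tf_utils.py: try the `weight` attribute, then `decoder`, run the model once on its dummy inputs if neither exists yet, and retry before giving up with None. The toy below shows why the build-and-retry step matters for layers whose variables are only created lazily; ToyTiedLayer and get_embedding_weight are invented names, not library code.

```python
import tensorflow as tf

class ToyTiedLayer(tf.keras.layers.Layer):
    """A head whose projection matrix only exists after the layer is built."""

    def build(self, input_shape):
        self.decoder = self.add_weight(name="decoder", shape=[3, 5])
        super().build(input_shape)

    def call(self, inputs):
        return tf.matmul(inputs, self.decoder)

def get_embedding_weight(build_fn, layer):
    # Same lookup order as the helper above: `weight`, then `decoder`,
    # then build once and retry, finally fall back to None.
    for attr in ("weight", "decoder"):
        found = getattr(layer, attr, None)
        if found is not None:
            return found
    build_fn()  # building the layer creates the missing variable
    for attr in ("weight", "decoder"):
        found = getattr(layer, attr, None)
        if found is not None:
            return found
    return None

head = ToyTiedLayer()
weight = get_embedding_weight(lambda: head(tf.ones((1, 3))), head)
print(weight.shape)  # (3, 5), only retrievable thanks to the build-and-retry step
```
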