Unverified commit 14042d56, authored by Julien Plu, committed by GitHub

New TF embeddings (cleaner and faster) (#9418)



* Create new embeddings + add to BERT

* Add Albert

* Add DistilBert

* Add Albert + Electra + Funnel

* Add Longformer + Lxmert

* Add last models

* Apply style

* Update the template

* Remove unused imports

* Rename attribute

* Import embeddings in their own model file

* Replace word_embeddings with weight

* Fix naming

* Fix Albert

* Fix Albert

* Fix Longformer

* Fix Lxmert, Mobilebert and MPNet

* Fix copy

* Fix template

* Update the get weights function

* Update src/transformers/modeling_tf_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/electra/modeling_tf_electra.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Address Sylvain's comments

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 12f0d7e8
src/transformers/modeling_tf_utils.py

@@ -809,25 +809,29 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):

        return model_embeds

Removed:

    def _get_word_embedding_weight(self, embedding_layer):
        if hasattr(embedding_layer, "word_embeddings"):
            return embedding_layer.word_embeddings
        elif hasattr(embedding_layer, "weight"):
            return embedding_layer.weight
        elif hasattr(embedding_layer, "decoder"):
            return embedding_layer.decoder
        else:
            # Here we build the word embeddings weights if not exists.
            # And then we retry to get the attribute once built.
            self(self.dummy_inputs)
            if hasattr(embedding_layer, "word_embeddings"):
                return embedding_layer.word_embeddings
            elif hasattr(embedding_layer, "weight"):
                return embedding_layer.weight
            elif hasattr(embedding_layer, "decoder"):
                return embedding_layer.decoder
            else:
                return None

Added:

    def _get_word_embedding_weight(model, embedding_layer):
        embeds = getattr(embedding_layer, "weight", None)
        if embeds is not None:
            return embeds

        embeds = getattr(embedding_layer, "decoder", None)
        if embeds is not None:
            return embeds

        # The reason why the attributes don't exist might be
        # because the model is not built, so retry getting
        # the argument after building the model
        model(model.dummy_inputs)

        embeds = getattr(embedding_layer, "weight", None)
        if embeds is not None:
            return embeds

        embeds = getattr(embedding_layer, "decoder", None)
        if embeds is not None:
            return embeds

        return None

    def _resize_token_embeddings(self, new_num_tokens):
        old_embeddings = self._get_word_embedding_weight(self.get_input_embeddings())
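A tiny standalone illustration (not part of the commit) of the lookup pattern the new _get_word_embedding_weight relies on: getattr with a None default replaces the hasattr chain, and a forward pass on dummy inputs forces Keras to build the layer so that lazily-created weights exist. The Dense layer here is only a stand-in for an embedding layer.

import tensorflow as tf

layer = tf.keras.layers.Dense(4)
print(getattr(layer, "kernel", None))        # None: the layer has not been built yet

layer(tf.zeros([1, 3]))                      # building the layer creates its weights
print(getattr(layer, "kernel", None).shape)  # (3, 4)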
@@ -1319,6 +1323,119 @@ class TFConv1D(tf.keras.layers.Layer):

        return x

Added:

class WordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.word_embeddings = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.word_embeddings, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
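A minimal sketch (not code from the commit) of why the new layers implement get_config: because every hyper-parameter is serialized, a layer can be rebuilt from its config through the standard Keras hooks. `WordEmbeddings` is the class defined just above; importing it from transformers.modeling_tf_utils is an assumption about where this commit places it.

import tensorflow as tf

from transformers.modeling_tf_utils import WordEmbeddings  # assumed import path

layer = WordEmbeddings(vocab_size=100, hidden_size=8, initializer_range=0.02, name="word_embeddings")
config = layer.get_config()                 # {"vocab_size": 100, "hidden_size": 8, "initializer_range": 0.02, ...}
clone = WordEmbeddings.from_config(config)  # rebuilt from the serialized hyper-parameters

ids = tf.constant([[1, 2, 3]])
print(layer(ids).shape)   # (1, 3, 8) -- the weight is created lazily in build() on the first call
print(clone(ids).shape)   # (1, 3, 8)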
class TokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
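A small standalone check (not from the commit) of the trick TokenTypeEmbeddings uses above: multiplying a one-hot matrix by the embedding table is the same lookup as tf.gather, but it avoids a gather on a tiny vocabulary (usually two token types), which tends to be friendlier to accelerators. Shapes and values below are made up for the demonstration.

import tensorflow as tf

type_vocab_size, hidden_size = 2, 4
table = tf.random.normal([type_vocab_size, hidden_size])
token_type_ids = tf.constant([[0, 0, 1, 1]])

flat_ids = tf.reshape(token_type_ids, [-1])
one_hot = tf.one_hot(flat_ids, depth=type_vocab_size, dtype=table.dtype)
via_matmul = tf.reshape(tf.matmul(one_hot, table), [1, 4, hidden_size])
via_gather = tf.gather(table, token_type_ids)

# Both paths produce the same embeddings.
tf.debugging.assert_near(via_matmul, via_gather)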
class PositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape)

class TFSharedEmbeddings(tf.keras.layers.Layer):
    r"""
    Construct shared token embeddings.
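A hedged sketch (not code from the commit) of how the three building blocks above are meant to be combined: each model file below wires word, position and token-type embeddings into a single layer that sums them, then applies LayerNorm and dropout. The import path and the toy hyper-parameters are assumptions made for the sake of a runnable example.

import tensorflow as tf

from transformers.modeling_tf_utils import PositionEmbeddings, TokenTypeEmbeddings, WordEmbeddings  # assumed path


class ToyEmbeddings(tf.keras.layers.Layer):
    def __init__(self, vocab_size=100, type_vocab_size=2, max_position_embeddings=32, hidden_size=8, **kwargs):
        super().__init__(**kwargs)
        self.word_embeddings = WordEmbeddings(vocab_size, hidden_size, 0.02, name="word_embeddings")
        self.position_embeddings = PositionEmbeddings(max_position_embeddings, hidden_size, 0.02, name="position_embeddings")
        self.token_type_embeddings = TokenTypeEmbeddings(type_vocab_size, hidden_size, 0.02, name="token_type_embeddings")
        self.embeddings_sum = tf.keras.layers.Add()
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(rate=0.1)

    def call(self, input_ids, token_type_ids=None, training=False):
        inputs_embeds = self.word_embeddings(input_ids=input_ids)
        if token_type_ids is None:
            token_type_ids = tf.fill(dims=tf.shape(input_ids), value=0)
        # PositionEmbeddings only inspects the shape of its argument, so the word embeddings
        # themselves can be passed in when no explicit position_ids are given.
        position_embeds = self.position_embeddings(position_ids=inputs_embeds)
        token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
        embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
        return self.dropout(self.LayerNorm(embeddings), training=training)


print(ToyEmbeddings()(tf.constant([[5, 6, 7]])).shape)  # (1, 3, 8)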
src/transformers/models/albert/modeling_tf_albert.py

@@ -73,124 +73,178 @@ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
]

Removed: the previous TFAlbertEmbeddings implementation, a single layer that created the word embedding
weight in `build`, used `tf.keras.layers.Embedding` sub-layers for positions and token types, and routed
its `call` through an "embedding"/"linear" `mode` argument with `_embedding`/`_linear` helpers (all
dimensions were `config.embedding_size`).

Added:

# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFAlbertWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
        self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFAlbertTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
        super().__init__(**kwargs)

        self.type_vocab_size = type_vocab_size
        self.hidden_size = hidden_size
        self.initializer_range = initializer_range

    def build(self, input_shape):
        self.token_type_embeddings = self.add_weight(
            name="embeddings",
            shape=[self.type_vocab_size, self.hidden_size],
            initializer=get_initializer(initializer_range=self.initializer_range),
        )

        super().build(input_shape=input_shape)

    def get_config(self):
        config = {
            "type_vocab_size": self.type_vocab_size,
            "hidden_size": self.hidden_size,
            "initializer_range": self.initializer_range,
        }
        base_config = super().get_config()

        return dict(list(base_config.items()) + list(config.items()))

    def call(self, token_type_ids):
        flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
        one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
        embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
        embeddings = tf.reshape(
            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
        )
        embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])

        return embeddings

# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
class TFAlbertPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
    def build(self, input_shape):
        self.position_embeddings = self.add_weight(
            name="embeddings",
            shape=[self.max_position_embeddings, self.hidden_size],
            initializer=get_initializer(initializer_range=self.initializer_range),
        )

        super().build(input_shape)

    def get_config(self):
        config = {
            "max_position_embeddings": self.max_position_embeddings,
            "hidden_size": self.hidden_size,
            "initializer_range": self.initializer_range,
        }
        base_config = super().get_config()

        return dict(list(base_config.items()) + list(config.items()))

    def call(self, position_ids):
        input_shape = shape_list(tensor=position_ids)
        position_embeddings = self.position_embeddings[: input_shape[1], :]

        return tf.broadcast_to(input=position_embeddings, shape=input_shape)

class TFAlbertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.word_embeddings = TFAlbertWordEmbeddings(
            vocab_size=config.vocab_size,
            hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFAlbertPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFAlbertTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
""" """
if mode == "embedding": Applies embedding based on inputs tensor.
return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): Returns:
"""Applies embedding based on inputs tensor.""" final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None) assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None: if input_ids is not None:
input_shape = shape_list(input_ids) inputs_embeds = self.word_embeddings(input_ids=input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
if token_type_ids is None: if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0) input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings if position_ids is None:
embeddings = self.LayerNorm(embeddings) position_embeds = self.position_embeddings(position_ids=inputs_embeds)
embeddings = self.dropout(embeddings, training=training) else:
return embeddings position_embeds = self.position_embeddings(position_ids=position_ids)
def _linear(self, inputs):
"""
Computes logits by running inputs through a linear layer
Args: token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
inputs: A float32 tensor with shape [batch_size, length, embedding_size final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
Returns: return final_embeddings
float32 tensor with shape [batch_size, length, vocab_size].
"""
batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.embedding_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size])

class TFAlbertSelfOutput(tf.keras.layers.Layer):

@@ -446,8 +500,9 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):

class TFAlbertMLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)

        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.dense = tf.keras.layers.Dense(
            config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )

@@ -474,7 +529,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):

        return self.decoder

    def set_output_embeddings(self, value):
        self.decoder.weight = value  # was: self.decoder.word_embeddings = value
        self.decoder.vocab_size = shape_list(value)[0]

    def get_bias(self):

@@ -486,10 +541,15 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):

        self.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states):
        hidden_states = self.dense(inputs=hidden_states)
        hidden_states = self.activation(inputs=hidden_states)
        hidden_states = self.LayerNorm(inputs=hidden_states)
        # was: hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias
        seq_length = shape_list(tensor=hidden_states)[1]
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
        hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias)

        return hidden_states

@@ -516,11 +576,11 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
        )

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings  # was: return self.embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings.weight = value  # was: self.embeddings.word_embeddings = value
        self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]  # was: self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """

@@ -844,7 +904,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):

        self.num_labels = config.num_labels

        self.albert = TFAlbertMainLayer(config, name="albert")
        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings.word_embeddings, name="predictions")  # was: TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
        self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")

    def get_lm_head(self):

@@ -964,7 +1024,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):

        super().__init__(config, *inputs, **kwargs)

        self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings.word_embeddings, name="predictions")  # was: TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")

    def get_lm_head(self):
        return self.predictions
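A self-contained sketch (plain TensorFlow, toy shapes chosen for illustration) of the weight-tied projection the new TFAlbertMLMHead.call performs instead of the old `mode="linear"` path: the hidden states are flattened, multiplied by the transposed word-embedding matrix, reshaped back, and a bias is added.

import tensorflow as tf

batch_size, seq_length, embedding_size, vocab_size = 2, 5, 8, 30
hidden_states = tf.random.normal([batch_size, seq_length, embedding_size])
embedding_weight = tf.random.normal([vocab_size, embedding_size])  # shared with the input embeddings
bias = tf.zeros([vocab_size])

x = tf.reshape(hidden_states, shape=[-1, embedding_size])        # (batch * seq, embedding)
logits = tf.matmul(x, embedding_weight, transpose_b=True)        # (batch * seq, vocab)
logits = tf.reshape(logits, shape=[-1, seq_length, vocab_size])  # (batch, seq, vocab)
logits = tf.nn.bias_add(logits, bias)

print(logits.shape)  # (2, 5, 30)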
src/transformers/models/bert/modeling_tf_bert.py

@@ -121,124 +121,174 @@ class TFBertPreTrainingLoss:

        return masked_lm_loss + next_sentence_loss

Removed: the previous TFBertEmbeddings implementation, a single layer that created the word embedding
weight in `build`, used `tf.keras.layers.Embedding` sub-layers for positions and token types, and routed
its `call` through an "embedding"/"linear" `mode` argument with `_embedding`/`_linear` helpers.

Added:

class TFBertWordEmbeddings(tf.keras.layers.Layer):
    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.initializer_range = initializer_range

    def build(self, input_shape):
        self.weight = self.add_weight(
            name="weight",
            shape=[self.vocab_size, self.hidden_size],
            initializer=get_initializer(initializer_range=self.initializer_range),
        )

        super().build(input_shape=input_shape)

    def get_config(self):
        config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
        )
        embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])

        return embeddings

class TFBertTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
    def build(self, input_shape):
        self.token_type_embeddings = self.add_weight(
            name="embeddings",
            shape=[self.type_vocab_size, self.hidden_size],
            initializer=get_initializer(initializer_range=self.initializer_range),
        )

        super().build(input_shape=input_shape)

    def get_config(self):
        config = {
            "type_vocab_size": self.type_vocab_size,
            "hidden_size": self.hidden_size,
            "initializer_range": self.initializer_range,
        }
        base_config = super().get_config()

        return dict(list(base_config.items()) + list(config.items()))

    def call(self, token_type_ids):
        flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
        one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
        embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
        embeddings = tf.reshape(
            tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
        )
        embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])

        return embeddings


class TFBertPositionEmbeddings(tf.keras.layers.Layer):
    def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
        super().__init__(**kwargs)

        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.initializer_range = initializer_range

    def build(self, input_shape):
        self.position_embeddings = self.add_weight(
            name="embeddings",
            shape=[self.max_position_embeddings, self.hidden_size],
            initializer=get_initializer(initializer_range=self.initializer_range),
        )

        super().build(input_shape)

    def get_config(self):
        config = {
            "max_position_embeddings": self.max_position_embeddings,
            "hidden_size": self.hidden_size,
            "initializer_range": self.initializer_range,
        }
        base_config = super().get_config()

        return dict(list(base_config.items()) + list(config.items()))

    def call(self, position_ids):
        input_shape = shape_list(tensor=position_ids)
        position_embeddings = self.position_embeddings[: input_shape[1], :]

        return tf.broadcast_to(input=position_embeddings, shape=input_shape)


class TFBertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = TFBertWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFBertPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFBertTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
        Returns:
            final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            inputs_embeds = self.word_embeddings(input_ids=input_ids)

if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_embeds = self.position_embeddings(position_ids=inputs_embeds)
else:
position_embeds = self.position_embeddings(position_ids=position_ids)
token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
        return final_embeddings

class TFBertSelfAttention(tf.keras.layers.Layer):

@@ -251,8 +301,8 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
                f"of attention heads ({config.num_attention_heads})"
            )

        # removed: self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.query = tf.keras.layers.experimental.EinsumDense(
            equation="abc,cde->abde",
            output_shape=(None, config.num_attention_heads, self.attention_head_size),

@@ -318,9 +368,9 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
                f"of attention heads ({config.num_attention_heads})"
            )

        # removed: self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = config.num_attention_heads * self.attention_head_size  # was: self.num_attention_heads * self.attention_head_size
        self.dense = tf.keras.layers.experimental.EinsumDense(
            equation="abcd,cde->abe",
            output_shape=(None, self.all_head_size),

@@ -516,6 +566,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):

        super().__init__(**kwargs)

        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.transform = TFBertPredictionHeadTransform(config, name="transform")

        # The output weights are the same as the input embeddings, but there is

@@ -531,7 +583,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):

        return self.input_embeddings

    def set_output_embeddings(self, value):
        self.input_embeddings.weight = value  # was: self.input_embeddings.word_embeddings = value
        self.input_embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self):

@@ -542,9 +594,12 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):

        self.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states):
        hidden_states = self.transform(hidden_states=hidden_states)
        # was: hidden_states = self.input_embeddings(hidden_states, mode="linear")
        #      hidden_states = hidden_states + self.bias
        seq_length = shape_list(tensor=hidden_states)[1]
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)

        return hidden_states

@@ -583,21 +638,17 @@ class TFBertMainLayer(tf.keras.layers.Layer):

        super().__init__(**kwargs)

        self.config = config
        # removed: the num_hidden_layers / initializer_range / output_attentions / output_hidden_states /
        # return_dict shortcut attributes; the values are now read from self.config where needed
        self.embeddings = TFBertEmbeddings(config, name="embeddings")
        self.encoder = TFBertEncoder(config, name="encoder")
        self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings  # was: return self.embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings.weight = value  # was: self.embeddings.word_embeddings = value
        self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]  # was: self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """

@@ -682,7 +733,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):

        if inputs["head_mask"] is not None:
            raise NotImplementedError
        else:
            inputs["head_mask"] = [None] * self.config.num_hidden_layers  # was: [None] * self.num_hidden_layers

        encoder_outputs = self.encoder(
            embedding_output,

@@ -931,7 +982,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):

        self.bert = TFBertMainLayer(config, name="bert")
        self.nsp = TFBertNSPHead(config, name="nsp___cls")
        self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls")  # was: TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")

    def get_lm_head(self):
        return self.mlm.predictions

@@ -1055,7 +1106,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
        )

        self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
        self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls")  # was: TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")

    def get_lm_head(self):
        return self.mlm.predictions

@@ -1158,7 +1209,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):

        logger.warning("If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
        self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls")  # was: TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")

    def get_lm_head(self):
        return self.mlm.predictions
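A hedged usage sketch (not part of the diff): after the refactor, get_input_embeddings() returns the word-embeddings sub-layer, and the shared matrix lives in its `.weight` attribute, which is what resize_token_embeddings() and the tied LM heads now manipulate. The model name and the number of added tokens are placeholders.

from transformers import TFBertForMaskedLM

model = TFBertForMaskedLM.from_pretrained("bert-base-uncased")

embeddings = model.get_input_embeddings()   # a TFBertWordEmbeddings instance
print(embeddings.vocab_size, embeddings.weight.shape)

model.resize_token_embeddings(embeddings.vocab_size + 2)  # e.g. after adding two special tokens
print(model.get_input_embeddings().weight.shape)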
src/transformers/models/distilbert/modeling_tf_distilbert.py

@@ -67,104 +67,128 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
]

Removed: the previous TFEmbeddings internals, which created the word embedding weight in `build`, used a
`tf.keras.layers.Embedding` sub-layer for positions, and routed `call` through an "embedding"/"linear"
`mode` argument with `_embedding`/`_linear` helpers (sized with `config.dim`).

Added:

# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFDistilBertWordEmbeddings(tf.keras.layers.Layer):
    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.initializer_range = initializer_range

    def build(self, input_shape):
        self.weight = self.add_weight(
            name="weight",
            shape=[self.vocab_size, self.hidden_size],
            initializer=get_initializer(initializer_range=self.initializer_range),
        )

        super().build(input_shape=input_shape)

def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
class TFDistilBertPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
    def build(self, input_shape):
        self.position_embeddings = self.add_weight(
            name="embeddings",
            shape=[self.max_position_embeddings, self.hidden_size],
            initializer=get_initializer(initializer_range=self.initializer_range),
        )

        super().build(input_shape)

    def get_config(self):
        config = {
            "max_position_embeddings": self.max_position_embeddings,
            "hidden_size": self.hidden_size,
            "initializer_range": self.initializer_range,
        }
        base_config = super().get_config()

        return dict(list(base_config.items()) + list(config.items()))

    def call(self, position_ids):
        input_shape = shape_list(tensor=position_ids)
        position_embeddings = self.position_embeddings[: input_shape[1], :]

        return tf.broadcast_to(input=position_embeddings, shape=input_shape)


class TFEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.dim = config.dim
self.initializer_range = config.initializer_range
self.word_embeddings = TFDistilBertWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.dim,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFDistilBertPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.dim,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.dropout)
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
""" """
Parameters: Applies embedding based on inputs tensor.
input_ids: tf.Tensor(bs, max_seq_length) The token ids to embed.
Returns: Returns:
tf.Tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
""" """
assert not (input_ids is None and inputs_embeds is None) assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None: if input_ids is not None:
seq_length = shape_list(input_ids)[1] inputs_embeds = self.word_embeddings(input_ids=input_ids)
else:
seq_length = shape_list(inputs_embeds)[1]
if position_ids is None: if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] position_embeds = self.position_embeddings(position_ids=inputs_embeds)
else:
if inputs_embeds is None: position_embeds = self.position_embeddings(position_ids=position_ids)
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = tf.cast(
self.position_embeddings(position_ids), inputs_embeds.dtype
) # (bs, max_seq_length, dim)
embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim)
embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim)
embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim)
return embeddings
def _linear(self, inputs):
"""
Computes logits by running inputs through a linear layer
Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns:
float32 tensor with shape [batch_size, length, vocab_size].
"""
batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.dim]) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True) final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) return final_embeddings

class TFMultiHeadSelfAttention(tf.keras.layers.Layer):

@@ -397,11 +421,11 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):

        self.transformer = TFTransformer(config, name="transformer")  # Encoder

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings  # was: return self.embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings.weight = value  # was: self.embeddings.word_embeddings = value
        self.embeddings.word_embeddings.vocab_size = value.shape[0]  # was: self.embeddings.vocab_size = value.shape[0]

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError

@@ -636,7 +660,9 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):

class TFDistilBertLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)

        self.vocab_size = config.vocab_size
        self.dim = config.dim

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.

@@ -644,13 +670,14 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):

    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")

        super().build(input_shape)

    def get_output_embeddings(self):
        return self.input_embeddings

    def set_output_embeddings(self, value):
        self.input_embeddings.weight = value  # was: self.input_embeddings.word_embeddings = value
        self.input_embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self):

@@ -661,8 +688,12 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):

        self.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states):
        # was: hidden_states = self.input_embeddings(hidden_states, mode="linear")
        #      hidden_states = hidden_states + self.bias
        seq_length = shape_list(tensor=hidden_states)[1]
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim])
        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)

        return hidden_states

@@ -681,7 +712,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
        )
        self.act = get_tf_activation("gelu")
        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
        self.vocab_projector = TFDistilBertLMHead(
            config, self.distilbert.embeddings.word_embeddings, name="vocab_projector"
        )  # was: TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")

    def get_lm_head(self):
        return self.vocab_projector
src/transformers/models/electra/modeling_tf_electra.py

@@ -70,6 +70,122 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
]

Added:

# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFElectraWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFElectraTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
class TFElectraPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape)

# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra
class TFElectraSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):

@@ -81,8 +197,8 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):
                f"of attention heads ({config.num_attention_heads})"
            )

        # removed: self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.query = tf.keras.layers.experimental.EinsumDense(
            equation="abc,cde->abde",
            output_shape=(None, config.num_attention_heads, self.attention_head_size),

@@ -138,7 +254,7 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):

        return outputs

# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput
class TFElectraSelfOutput(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

@@ -149,9 +265,9 @@ class TFElectraSelfOutput(tf.keras.layers.Layer):
                f"of attention heads ({config.num_attention_heads})"
            )

        # removed: self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = config.num_attention_heads * self.attention_head_size  # was: self.num_attention_heads * self.attention_head_size
        self.dense = tf.keras.layers.experimental.EinsumDense(
            equation="abcd,cde->abe",
            output_shape=(None, self.all_head_size),

@@ -331,120 +447,56 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):

Removed: the old TFElectraEmbeddings internals, which mirrored the removed BERT/ALBERT implementation
(word embedding weight created in `build`, `tf.keras.layers.Embedding` sub-layers, and the
"embedding"/"linear" `mode` dispatch), all sized with `config.embedding_size`.

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.word_embeddings = TFElectraWordEmbeddings(
            vocab_size=config.vocab_size,
            hidden_size=config.embedding_size,
            initializer_range=config.initializer_range,
            name="word_embeddings",
        )
        self.position_embeddings = TFElectraPositionEmbeddings(
            max_position_embeddings=config.max_position_embeddings,
            hidden_size=config.embedding_size,
            initializer_range=config.initializer_range,
            name="position_embeddings",
        )
        self.token_type_embeddings = TFElectraTokenTypeEmbeddings(
            type_vocab_size=config.type_vocab_size,
            hidden_size=config.embedding_size,
            initializer_range=config.initializer_range,
            name="token_type_embeddings",
        )
        self.embeddings_sum = tf.keras.layers.Add()
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call # Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings.call with Albert->Electra
def call( def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
):
""" """
Get token embeddings of inputs. Applies embedding based on inputs tensor.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
mode: string, a valid value is one of "embedding" and "linear".
Returns: Returns:
outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length,
vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
""" """
if mode == "embedding":
return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings._embedding
def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
assert not (input_ids is None and inputs_embeds is None) assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None: if input_ids is not None:
input_shape = shape_list(input_ids) inputs_embeds = self.word_embeddings(input_ids=input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
if token_type_ids is None: if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0) input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) if position_ids is None:
token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype) position_embeds = self.position_embeddings(position_ids=inputs_embeds)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings else:
embeddings = self.LayerNorm(embeddings) position_embeds = self.position_embeddings(position_ids=position_ids)
embeddings = self.dropout(embeddings, training=training)
return embeddings
def _linear(self, inputs):
"""
Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns: token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
float32 tensor with shape [batch_size, length, vocab_size]. final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
""" final_embeddings = self.LayerNorm(inputs=final_embeddings)
batch_size = shape_list(inputs)[0] final_embeddings = self.dropout(inputs=final_embeddings, training=training)
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.embedding_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) return final_embeddings
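Pulling the pieces together, the rewritten embeddings forward pass is now: look up the three terms with dedicated sub-layers, sum them with a Keras Add layer, then apply LayerNorm and dropout; the old mode="embedding"/"linear" switch and the _linear projection are gone. A rough standalone sketch of that flow, with stand-in layers and made-up sizes (not the PR's code):

import tensorflow as tf

hidden_size = 8
word_emb = tf.keras.layers.Embedding(30522, hidden_size)     # stand-in for the word-embedding sub-layer
type_emb = tf.keras.layers.Embedding(2, hidden_size)         # stand-in for the token-type sub-layer
pos_table = tf.random.normal((512, hidden_size))             # stand-in for the learned position table
embeddings_sum = tf.keras.layers.Add()
layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
dropout = tf.keras.layers.Dropout(rate=0.1)

def embed(input_ids, token_type_ids=None, training=False):
    inputs_embeds = word_emb(input_ids)                                   # [batch, seq, hidden]
    if token_type_ids is None:
        token_type_ids = tf.fill(dims=tf.shape(input_ids), value=0)       # default segment 0
    seq_length = tf.shape(input_ids)[1]
    position_embeds = tf.broadcast_to(pos_table[:seq_length, :], tf.shape(inputs_embeds))
    token_type_embeds = type_emb(token_type_ids)
    out = embeddings_sum([inputs_embeds, position_embeds, token_type_embeds])
    out = layer_norm(out)
    return dropout(out, training=training)

print(embed(tf.constant([[1, 2, 3]])).shape)  # (1, 3, 8)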
class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer):
...@@ -508,11 +560,11 @@ class TFElectraMainLayer(tf.keras.layers.Layer): ...@@ -508,11 +560,11 @@ class TFElectraMainLayer(tf.keras.layers.Layer):
self.config = config self.config = config
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
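Since get_input_embeddings now returns the word-embedding sub-layer and set_input_embeddings writes its weight and vocab_size directly, the embedding matrix is reachable as a plain weight attribute on whatever get_input_embeddings hands back. A tiny stand-in layer (made-up sizes, not the PR's code) showing that attribute once the layer has been built:

import tensorflow as tf

class TinyWordEmbeddings(tf.keras.layers.Layer):    # stand-in for the new *WordEmbeddings layers
    def build(self, input_shape):
        self.weight = self.add_weight(name="weight", shape=[10, 4])
        super().build(input_shape)

    def call(self, input_ids):
        return tf.gather(params=self.weight, indices=input_ids)

layer = TinyWordEmbeddings()
layer(tf.constant([[1, 2]]))                         # first call builds the layer
print(getattr(layer, "weight", None).shape)          # (10, 4)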
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" """
...@@ -903,6 +955,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer): ...@@ -903,6 +955,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.embedding_size = config.embedding_size
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape):
...@@ -914,7 +967,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer): ...@@ -914,7 +967,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.input_embeddings.word_embeddings = value self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0] self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
...@@ -924,9 +977,12 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer): ...@@ -924,9 +977,12 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states, training=False): def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear") seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = hidden_states + self.bias hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
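All the rewritten MLM heads project back to the vocabulary the same way: flatten the hidden states, multiply by the shared embedding matrix transposed, reshape, and add the bias, replacing the old mode="linear" call into the embeddings layer. A numeric sketch with made-up sizes:

import tensorflow as tf

batch_size, seq_length, hidden_size, vocab_size = 2, 4, 8, 16
hidden_states = tf.random.normal((batch_size, seq_length, hidden_size))
embedding_weight = tf.random.normal((vocab_size, hidden_size))    # shared [vocab_size, hidden_size] matrix
bias = tf.zeros((vocab_size,))

x = tf.reshape(hidden_states, shape=[-1, hidden_size])            # [batch*seq, hidden]
logits = tf.matmul(x, embedding_weight, transpose_b=True)         # [batch*seq, vocab]
logits = tf.reshape(logits, shape=[-1, seq_length, vocab_size])   # [batch, seq, vocab]
logits = tf.nn.bias_add(logits, bias)
print(logits.shape)  # (2, 4, 16)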
...@@ -953,7 +1009,9 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -953,7 +1009,9 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
else: else:
self.activation = config.hidden_act self.activation = config.hidden_act
self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") self.generator_lm_head = TFElectraMaskedLMHead(
config, self.electra.embeddings.word_embeddings, name="generator_lm_head"
)
def get_lm_head(self): def get_lm_head(self):
return self.generator_lm_head return self.generator_lm_head
...
...@@ -74,89 +74,78 @@ TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -74,89 +74,78 @@ TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [
INF = 1e6 INF = 1e6
class TFFunnelEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
"""Construct the embeddings from word embeddings.""" class TFFunnelWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.initializer_range = config.initializer_range
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.vocab_size = vocab_size
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape):
"""Build shared word embedding layer """ self.weight = self.add_weight(
with tf.name_scope("word_embeddings"): name="weight",
# Create and initialize weights. The random normal initializer was chosen shape=[self.vocab_size, self.hidden_size],
# arbitrarily, and works well. initializer=get_initializer(initializer_range=self.initializer_range),
self.word_embeddings = self.add_weight( )
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
def call( super().build(input_shape=input_shape)
self,
input_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
):
"""
Get token embeddings of inputs
Args: def get_config(self):
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) config = {
mode: string, a valid value is one of "embedding" and "linear" "vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
Returns: return dict(list(base_config.items()) + list(config.items()))
outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length,
embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length,
vocab_size]
Raises: def call(self, input_ids):
ValueError: if mode is not valid. flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
Shared weights logic adapted from embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(input_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids, inputs_embeds, training=False): return embeddings
"""Applies embedding based on inputs tensor."""
assert not (input_ids is None and inputs_embeds is None)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
embeddings = self.layer_norm(inputs_embeds)
embeddings = self.dropout(embeddings, training=training)
return embeddings class TFFunnelEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def _linear(self, inputs): def __init__(self, config, **kwargs):
""" super().__init__(**kwargs)
Computes logits by running inputs through a linear layer
Args: self.word_embeddings = TFFunnelWordEmbeddings(
inputs: A float32 tensor with shape [batch_size, length, hidden_size vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout)
def call(self, input_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
""" """
batch_size = shape_list(inputs)[0] assert not (input_ids is None and inputs_embeds is None)
length = shape_list(inputs)[1] assert not (input_ids is not None and inputs_embeds is not None)
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True) if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) final_embeddings = self.LayerNorm(inputs=inputs_embeds)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
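Unlike the BERT-style layers, the Funnel embeddings add no position or token-type terms: look up the word embeddings (or take inputs_embeds directly), then LayerNorm and dropout, with the two asserts enforcing that exactly one of input_ids / inputs_embeds is given. A compact stand-in of that path (names and sizes illustrative, not the PR's code):

import tensorflow as tf

word_emb = tf.keras.layers.Embedding(100, 8)                      # stand-in word-embedding layer
layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-9)
dropout = tf.keras.layers.Dropout(rate=0.1)

def funnel_embed(input_ids=None, inputs_embeds=None, training=False):
    assert (input_ids is None) != (inputs_embeds is None)         # exactly one source, as in the asserts above
    if input_ids is not None:
        inputs_embeds = word_emb(input_ids)
    return dropout(layer_norm(inputs_embeds), training=training)

print(funnel_embed(input_ids=tf.constant([[3, 4, 5]])).shape)     # (1, 3, 8)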
class TFFunnelAttentionStructure: class TFFunnelAttentionStructure:
...@@ -784,11 +773,11 @@ class TFFunnelBaseLayer(tf.keras.layers.Layer): ...@@ -784,11 +773,11 @@ class TFFunnelBaseLayer(tf.keras.layers.Layer):
self.encoder = TFFunnelEncoder(config, name="encoder") self.encoder = TFFunnelEncoder(config, name="encoder")
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
raise NotImplementedError # Not implemented yet in the library for TF 2.0 models raise NotImplementedError # Not implemented yet in the library for TF 2.0 models
...@@ -870,11 +859,11 @@ class TFFunnelMainLayer(tf.keras.layers.Layer): ...@@ -870,11 +859,11 @@ class TFFunnelMainLayer(tf.keras.layers.Layer):
self.decoder = TFFunnelDecoder(config, name="decoder") self.decoder = TFFunnelDecoder(config, name="decoder")
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
raise NotImplementedError # Not implemented yet in the library for TF 2.0 models raise NotImplementedError # Not implemented yet in the library for TF 2.0 models
...@@ -987,17 +976,19 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer): ...@@ -987,17 +976,19 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.input_embeddings = input_embeddings self.input_embeddings = input_embeddings
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
def get_output_embeddings(self): def get_output_embeddings(self):
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.input_embeddings.word_embeddings = value self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0] self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
...@@ -1008,8 +999,12 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer): ...@@ -1008,8 +999,12 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer):
self.vocab_size = shape_list(value["bias"])[0] self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states, training=False): def call(self, hidden_states, training=False):
hidden_states = self.input_embeddings(hidden_states, mode="linear") seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = hidden_states + self.bias hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
...@@ -1362,7 +1357,7 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss) ...@@ -1362,7 +1357,7 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss)
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.funnel = TFFunnelMainLayer(config, name="funnel") self.funnel = TFFunnelMainLayer(config, name="funnel")
self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings, name="lm_head") self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings.word_embeddings, name="lm_head")
def get_lm_head(self): def get_lm_head(self):
return self.lm_head return self.lm_head
...
...@@ -415,14 +415,135 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se ...@@ -415,14 +415,135 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se
return attention_mask return attention_mask
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead # Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFLongformerWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
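The copied word-embedding layers all implement the lookup as flatten, gather, reshape, then set_shape to restore the static shape information that a reshape against a dynamic shape loses. Just the lookup, as a standalone snippet with made-up sizes:

import tensorflow as tf

vocab_size, hidden_size = 16, 4
weight = tf.random.normal((vocab_size, hidden_size))              # stand-in for the learned weight
input_ids = tf.constant([[1, 2, 3], [4, 5, 6]])                   # [batch, seq]

flat_input_ids = tf.reshape(input_ids, shape=[-1])                                # [batch*seq]
embeddings = tf.gather(params=weight, indices=flat_input_ids)                     # [batch*seq, hidden]
embeddings = tf.reshape(
    embeddings, shape=tf.concat([tf.shape(input_ids), [hidden_size]], axis=0)     # back to [batch, seq, hidden]
)
print(embeddings.shape)  # (2, 3, 4)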
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFLongformerTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
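Token-type (segment) embeddings are looked up differently: the ids are one-hot encoded and multiplied against the small [type_vocab_size, hidden_size] table instead of gathered, a common trick when the vocabulary is tiny (the diff itself does not state the motivation). A standalone sketch with made-up values:

import tensorflow as tf

type_vocab_size, hidden_size = 2, 4
token_type_table = tf.random.normal((type_vocab_size, hidden_size))    # stand-in for the learned table
token_type_ids = tf.constant([[0, 0, 1], [0, 1, 1]])                   # [batch, seq]

flat_ids = tf.reshape(token_type_ids, shape=[-1])                                    # [batch*seq]
one_hot_data = tf.one_hot(indices=flat_ids, depth=type_vocab_size, dtype=tf.float32)
embeddings = tf.matmul(one_hot_data, token_type_table)                                # [batch*seq, hidden]
embeddings = tf.reshape(
    embeddings, shape=tf.concat([tf.shape(token_type_ids), [hidden_size]], axis=0)
)
print(embeddings.shape)  # (2, 3, 4)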
class TFLongformerPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size])
return embeddings
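Longformer's position layer is the one variant that still gathers by explicit ids rather than slicing and broadcasting, presumably because its position ids are offset by padding_idx and depend on which tokens are padding (see the id-creation helpers further down). A standalone sketch of that gather, with made-up sizes:

import tensorflow as tf

max_position_embeddings, hidden_size = 10, 4
position_table = tf.random.normal((max_position_embeddings, hidden_size))   # stand-in weight
position_ids = tf.constant([[2, 3, 4, 1, 1]])                                # e.g. ids with padding kept at 1

flat_position_ids = tf.reshape(position_ids, shape=[-1])
position_embeds = tf.gather(params=position_table, indices=flat_position_ids)
position_embeds = tf.reshape(
    position_embeds, shape=tf.concat([tf.shape(position_ids), [hidden_size]], axis=0)
)
print(position_embeds.shape)  # (1, 5, 4)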
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Longformer
class TFLongformerLMHead(tf.keras.layers.Layer): class TFLongformerLMHead(tf.keras.layers.Layer):
"""Roberta Head for masked language modeling.""" """Longformer Head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -442,7 +563,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer): ...@@ -442,7 +563,7 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
return self.decoder return self.decoder
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.decoder.word_embeddings = value self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0] self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
...@@ -458,11 +579,16 @@ class TFLongformerLMHead(tf.keras.layers.Layer): ...@@ -458,11 +579,16 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
hidden_states = self.layer_norm(hidden_states) hidden_states = self.layer_norm(hidden_states)
# project back to size of vocabulary with bias # project back to size of vocabulary with bias
hidden_states = self.decoder(hidden_states, mode="linear") + self.bias seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings with Roberta->Longformer
class TFLongformerEmbeddings(tf.keras.layers.Layer): class TFLongformerEmbeddings(tf.keras.layers.Layer):
""" """
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
...@@ -472,39 +598,27 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): ...@@ -472,39 +598,27 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.vocab_size = config.vocab_size self.word_embeddings = TFLongformerWordEmbeddings(
self.hidden_size = config.hidden_size vocab_size=config.vocab_size,
self.initializer_range = config.initializer_range hidden_size=config.hidden_size,
self.position_embeddings = tf.keras.layers.Embedding( initializer_range=config.initializer_range,
config.max_position_embeddings, name="word_embeddings",
config.hidden_size, )
embeddings_initializer=get_initializer(self.initializer_range), self.position_embeddings = TFLongformerPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings", name="position_embeddings",
) )
self.token_type_embeddings = tf.keras.layers.Embedding( self.token_type_embeddings = TFLongformerTokenTypeEmbeddings(
config.type_vocab_size, type_vocab_size=config.type_vocab_size,
config.hidden_size, hidden_size=config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range), initializer_range=config.initializer_range,
name="token_type_embeddings", name="token_type_embeddings",
) )
self.embeddings_sum = tf.keras.layers.Add()
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape):
"""Build shared word embedding layer """
with tf.name_scope("word_embeddings"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
def create_position_ids_from_input_ids(self, input_ids): def create_position_ids_from_input_ids(self, input_ids):
""" """
...@@ -516,14 +630,16 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): ...@@ -516,14 +630,16 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor Returns: tf.Tensor
""" """
input_ids_shape = shape_list(input_ids) input_ids_shape = shape_list(tensor=input_ids)
# multiple choice has 3 dimensions # multiple choice has 3 dimensions
if len(input_ids_shape) == 3: if len(input_ids_shape) == 3:
input_ids = tf.reshape(input_ids, (input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])) input_ids = tf.reshape(
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
)
mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=tf.int32) mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(mask, axis=1) * mask incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask
return incremental_indices + self.padding_idx return incremental_indices + self.padding_idx
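The cumsum trick above gives every real token an increasing position starting at padding_idx + 1 while padded tokens keep padding_idx. A worked example with made-up ids (1 being the padding token):

import tensorflow as tf

padding_idx = 1
input_ids = tf.constant([[0, 5, 7, 1, 1]])                        # last two tokens are padding

mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=input_ids.dtype)   # [[1 1 1 0 0]]
incremental_indices = tf.math.cumsum(mask, axis=1) * mask                           # [[1 2 3 0 0]]
position_ids = incremental_indices + padding_idx
print(position_ids.numpy())                                        # [[2 3 4 1 1]]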
...@@ -536,96 +652,41 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): ...@@ -536,96 +652,41 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor Returns: tf.Tensor
""" """
seq_length = shape_list(inputs_embeds)[1] batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2]
position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :]
return position_ids return tf.tile(input=position_ids, multiples=(batch_size, 1))
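When only inputs_embeds is available there is nothing to mask, so the ids are simply padding_idx + 1 .. padding_idx + seq_length; the rewritten helper tiles that row so there is one set of ids per batch entry. A small example with made-up shapes:

import tensorflow as tf

padding_idx = 1
inputs_embeds = tf.zeros((2, 4, 8))                                # [batch, seq, hidden]
batch_size, seq_length = inputs_embeds.shape[0], inputs_embeds.shape[1]

position_ids = tf.range(start=padding_idx + 1, limit=seq_length + padding_idx + 1)[tf.newaxis, :]
position_ids = tf.tile(input=position_ids, multiples=(batch_size, 1))   # one row of ids per batch entry
print(position_ids.numpy())                                        # [[2 3 4 5]
                                                                    #  [2 3 4 5]]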
def call( def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
):
""" """
Get token embeddings of inputs. Applies embedding based on inputs tensor.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
mode: string, a valid value is one of "embedding" and "linear".
Returns: Returns:
outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length,
vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
""" """
if mode == "embedding":
return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
assert not (input_ids is None and inputs_embeds is None) assert not (input_ids is None and inputs_embeds is None)
if position_ids is None:
if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None: if input_ids is not None:
input_shape = shape_list(input_ids) inputs_embeds = self.word_embeddings(input_ids=input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
if token_type_ids is None: if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0) input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) if position_ids is None:
token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype) if input_ids is not None:
embeddings = inputs_embeds + position_embeddings + token_type_embeddings # Create the position ids from the input token ids. Any padded tokens remain padded.
embeddings = self.LayerNorm(embeddings) position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
embeddings = self.dropout(embeddings, training=training) else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds)
return embeddings
def _linear(self, inputs):
"""
Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns: position_embeds = self.position_embeddings(position_ids=position_ids)
float32 tensor with shape [batch_size, length, vocab_size]. token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
""" final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
batch_size = shape_list(inputs)[0] final_embeddings = self.LayerNorm(inputs=final_embeddings)
length = shape_list(inputs)[1] final_embeddings = self.dropout(inputs=final_embeddings, training=training)
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate
...@@ -1613,11 +1674,11 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -1613,11 +1674,11 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
self.pooler = TFLongformerPooler(config, name="pooler") if add_pooling_layer else None self.pooler = TFLongformerPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" """
...@@ -2053,7 +2114,7 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel ...@@ -2053,7 +2114,7 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.longformer = TFLongformerMainLayer(config, add_pooling_layer=False, name="longformer") self.longformer = TFLongformerMainLayer(config, add_pooling_layer=False, name="longformer")
self.lm_head = TFLongformerLMHead(config, self.longformer.embeddings, name="lm_head") self.lm_head = TFLongformerLMHead(config, self.longformer.embeddings.word_embeddings, name="lm_head")
def get_lm_head(self): def get_lm_head(self):
return self.lm_head return self.lm_head
...
...@@ -177,112 +177,173 @@ class TFLxmertVisualFeatureEncoder(tf.keras.layers.Layer): ...@@ -177,112 +177,173 @@ class TFLxmertVisualFeatureEncoder(tf.keras.layers.Layer):
return output return output
class TFLxmertEmbeddings(tf.keras.layers.Layer): # Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
"""Construct the embeddings from word, position and token_type embeddings.""" class TFLxmertWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.initializer_range = config.initializer_range
self.position_embeddings = tf.keras.layers.Embedding( self.vocab_size = vocab_size
config.max_position_embeddings, self.hidden_size = hidden_size
config.hidden_size, self.initializer_range = initializer_range
embeddings_initializer=get_initializer(self.initializer_range),
name="position_embeddings", def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
) )
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size, super().build(input_shape=input_shape)
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range), def get_config(self):
name="token_type_embeddings", config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
) )
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape): return embeddings
"""Build shared word embedding layer """
with tf.name_scope("word_embeddings"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
def call(self, inputs, mode="embedding", training=False):
"""
Get token embeddings of inputs.
Args: # Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) class TFLxmertTokenTypeEmbeddings(tf.keras.layers.Layer):
mode: string, a valid value is one of "embedding" and "linear". def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
Returns: self.type_vocab_size = type_vocab_size
outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, self.hidden_size = hidden_size
embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, self.initializer_range = initializer_range
vocab_size].
Raises: def build(self, input_shape):
ValueError: if mode is not valid. self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
Shared weights logic adapted from super().build(input_shape=input_shape)
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(inputs, training=training)
elif mode == "linear":
return self._linear(inputs)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, inputs, training=False): def get_config(self):
"""Applies embedding based on inputs tensor.""" config = {
input_ids, token_type_ids, inputs_embeds = inputs "type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
if input_ids is not None: return dict(list(base_config.items()) + list(config.items()))
input_shape = shape_list(input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1] def call(self, token_type_ids):
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
if token_type_ids is None: one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
token_type_ids = tf.fill(input_shape, 0) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
if inputs_embeds is None: embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings return embeddings
def _linear(self, inputs):
"""
Computes logits by running inputs through a linear layer.
Args: # Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
inputs: A float32 tensor with shape [batch_size, length, hidden_size] class TFLxmertPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape)
class TFLxmertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = TFLxmertWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFLxmertPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFLxmertTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
""" """
batch_size = shape_list(inputs)[0] assert not (input_ids is None and inputs_embeds is None)
length = shape_list(inputs)[1]
if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)
if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
x = tf.reshape(inputs, [-1, self.hidden_size]) position_embeds = self.position_embeddings(position_ids=inputs_embeds)
logits = tf.matmul(x, self.word_embeddings, transpose_b=True) token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) return final_embeddings
class TFLxmertAttention(tf.keras.layers.Layer): class TFLxmertAttention(tf.keras.layers.Layer):
...@@ -703,11 +764,11 @@ class TFLxmertMainLayer(tf.keras.layers.Layer): ...@@ -703,11 +764,11 @@ class TFLxmertMainLayer(tf.keras.layers.Layer):
self.config = config self.config = config
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
raise NotImplementedError raise NotImplementedError
...@@ -787,7 +848,7 @@ class TFLxmertMainLayer(tf.keras.layers.Layer): ...@@ -787,7 +848,7 @@ class TFLxmertMainLayer(tf.keras.layers.Layer):
# Positional Word Embeddings # Positional Word Embeddings
embedding_output = self.embeddings( embedding_output = self.embeddings(
[inputs["input_ids"], inputs["token_type_ids"], inputs["inputs_embeds"]], training=inputs["training"] inputs["input_ids"], inputs["token_type_ids"], inputs["inputs_embeds"], training=inputs["training"]
) )
# Run Lxmert encoder # Run Lxmert encoder
...@@ -1066,31 +1127,38 @@ class TFLxmertPooler(tf.keras.layers.Layer): ...@@ -1066,31 +1127,38 @@ class TFLxmertPooler(tf.keras.layers.Layer):
return pooled_output return pooled_output
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert
class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
) )
if isinstance(config.hidden_act, str): if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act) self.transform_act_fn = get_tf_activation(config.hidden_act)
else: else:
self.transform_act_fn = config.hidden_act self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states) hidden_states = self.LayerNorm(hidden_states)
return hidden_states return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert
class TFLxmertLMPredictionHead(tf.keras.layers.Layer): class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.transform = TFLxmertPredictionHeadTransform(config, name="transform") self.transform = TFLxmertPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is # The output weights are the same as the input embeddings, but there is
...@@ -1099,13 +1167,14 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): ...@@ -1099,13 +1167,14 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
def build(self, input_shape): def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape) super().build(input_shape)
def get_output_embeddings(self): def get_output_embeddings(self):
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.input_embeddings.word_embeddings = value self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0] self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
...@@ -1116,12 +1185,17 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer): ...@@ -1116,12 +1185,17 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
self.vocab_size = shape_list(value["bias"])[0] self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.transform(hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
hidden_states = self.input_embeddings(hidden_states, mode="linear") seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = hidden_states + self.bias hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Lxmert
class TFLxmertMLMHead(tf.keras.layers.Layer): class TFLxmertMLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs): def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -1130,6 +1204,7 @@ class TFLxmertMLMHead(tf.keras.layers.Layer): ...@@ -1130,6 +1204,7 @@ class TFLxmertMLMHead(tf.keras.layers.Layer):
def call(self, sequence_output): def call(self, sequence_output):
prediction_scores = self.predictions(sequence_output) prediction_scores = self.predictions(sequence_output)
return prediction_scores return prediction_scores
...@@ -1229,7 +1304,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel): ...@@ -1229,7 +1304,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
self.lxmert = TFLxmertMainLayer(config, name="lxmert") self.lxmert = TFLxmertMainLayer(config, name="lxmert")
# Pre-training heads # Pre-training heads
self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings, name="cls") self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings.word_embeddings, name="cls")
if self.task_obj_predict: if self.task_obj_predict:
self.obj_predict_head = TFLxmertVisualObjHead(config, name="obj_predict_head") self.obj_predict_head = TFLxmertVisualObjHead(config, name="obj_predict_head")
if self.task_qa: if self.task_qa:
...
...@@ -107,30 +107,150 @@ class TFNoNorm(tf.keras.layers.Layer): ...@@ -107,30 +107,150 @@ class TFNoNorm(tf.keras.layers.Layer):
NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm} NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm}
# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFMobileBertWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFMobileBertTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
class TFMobileBertPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
return tf.broadcast_to(input=position_embeddings, shape=input_shape)
class TFMobileBertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""

def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.trigram_input = config.trigram_input
self.embedding_size = config.embedding_size
self.hidden_size = config.hidden_size
self.word_embeddings = TFMobileBertWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.embedding_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFMobileBertPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFMobileBertTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation")
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
...@@ -138,71 +258,23 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): ...@@ -138,71 +258,23 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
self.LayerNorm = NORM2FN[config.normalization_type](
config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.

Returns:
final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)

if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)

if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)

if self.trigram_input:
# From the paper MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited
...@@ -224,32 +296,17 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): ...@@ -224,32 +296,17 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
if self.trigram_input or self.embedding_size != self.hidden_size:
inputs_embeds = self.embedding_transformation(inputs_embeds)

if position_ids is None:
position_embeds = self.position_embeddings(position_ids=inputs_embeds)
else:
position_embeds = self.position_embeddings(position_ids=position_ids)

token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)

return final_embeddings
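The trigram lines themselves sit outside this hunk. Purely as an illustration of the idea referenced in the comment above, and not the diff's own code, one way to build the trigram input is to concatenate each token's left neighbour, itself and its right neighbour before the dense embedding_transformation maps the widened vector back to hidden_size:

import tensorflow as tf

inputs_embeds = tf.random.normal((2, 6, 4))                      # (batch, seq, embedding_size), toy sizes
left = tf.pad(inputs_embeds[:, 1:], [[0, 0], [0, 1], [0, 0]])    # shift left, pad the last position
right = tf.pad(inputs_embeds[:, :-1], [[0, 0], [1, 0], [0, 0]])  # shift right, pad the first position
trigram = tf.concat([left, inputs_embeds, right], axis=2)        # (batch, seq, 3 * embedding_size)
print(trigram.shape)  # (2, 6, 12)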
class TFMobileBertSelfAttention(tf.keras.layers.Layer): class TFMobileBertSelfAttention(tf.keras.layers.Layer):
...@@ -715,11 +772,11 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): ...@@ -715,11 +772,11 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
self.pooler = TFMobileBertPooler(config, name="pooler") if add_pooling_layer else None self.pooler = TFMobileBertPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
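With this change, get_input_embeddings() hands back the word-embedding sub-layer rather than the whole embeddings module, so swapping the vocabulary means rebinding that layer's `weight` and keeping `vocab_size` in sync. A toy stand-in (a hypothetical simplified layer, not the transformers class) that follows the same pattern:

import tensorflow as tf

class ToyWordEmbeddings(tf.keras.layers.Layer):
    # Stripped-down stand-in for the word-embedding sub-layer, for illustration only.
    def __init__(self, vocab_size, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def build(self, input_shape):
        self.weight = self.add_weight(name="weight", shape=[self.vocab_size, self.hidden_size])
        super().build(input_shape)

    def call(self, input_ids):
        return tf.gather(self.weight, input_ids)

emb = ToyWordEmbeddings(vocab_size=10, hidden_size=4)
_ = emb(tf.constant([[1, 2, 3]]))  # first call builds the weight

# set_input_embeddings-style update: rebind the matrix and keep vocab_size consistent.
new_matrix = tf.Variable(tf.random.normal((12, 4)), name="weight")
emb.weight = new_matrix
emb.vocab_size = new_matrix.shape[0]
print(emb(tf.constant([[11]])).shape)  # (1, 1, 4), ids beyond the old vocab now resolve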
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" """
......
...@@ -86,6 +86,86 @@ class TFMPNetPreTrainedModel(TFPreTrainedModel): ...@@ -86,6 +86,86 @@ class TFMPNetPreTrainedModel(TFPreTrainedModel):
return self.serving_output(output) return self.serving_output(output)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFMPNetWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
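The flatten, gather and reshape in the call() above keeps whatever leading dimensions the ids had and appends the hidden axis; a minimal sketch with illustrative sizes:

import tensorflow as tf

vocab_size, hidden_size = 100, 16
weight = tf.random.normal((vocab_size, hidden_size))
input_ids = tf.constant([[4, 7, 9], [1, 1, 2]])  # (batch, seq)

# Flatten, gather, then restore the leading dimensions plus the hidden axis.
flat_ids = tf.reshape(input_ids, [-1])
embeds = tf.gather(weight, flat_ids)
embeds = tf.reshape(embeds, tf.concat([tf.shape(input_ids), [hidden_size]], axis=0))
print(embeds.shape)  # (2, 3, 16)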
# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerPositionEmbeddings
class TFMPNetPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size])
return embeddings
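Unlike the slice-and-broadcast variant used by the BERT-style models, this layer gathers rows by explicit ids, which MPNet and RoBERTa need because their position ids start at padding_idx + 1 and leave padded slots at padding_idx. A small sketch with made-up sizes:

import tensorflow as tf

max_position_embeddings, hidden_size = 514, 8
position_table = tf.random.normal((max_position_embeddings, hidden_size))

position_ids = tf.constant([[2, 3, 4, 1, 1]])  # padding-aware ids, see create_position_ids_from_input_ids below
position_embeds = tf.gather(position_table, tf.reshape(position_ids, [-1]))
position_embeds = tf.reshape(position_embeds, tf.concat([tf.shape(position_ids), [hidden_size]], axis=0))
print(position_embeds.shape)  # (1, 5, 8)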
class TFMPNetEmbeddings(tf.keras.layers.Layer): class TFMPNetEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position embeddings.""" """Construct the embeddings from word, position embeddings."""
...@@ -93,136 +173,84 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): ...@@ -93,136 +173,84 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.word_embeddings = TFMPNetWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFMPNetPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def create_position_ids_from_input_ids(self, input_ids):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
input_ids: tf.Tensor
Returns: tf.Tensor
"""
input_ids_shape = shape_list(tensor=input_ids)

# multiple choice has 3 dimensions
if len(input_ids_shape) == 3:
input_ids = tf.reshape(
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
)

mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask

return incremental_indices + self.padding_idx
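A quick numeric check of the padding-aware position ids computed above (padding_idx and the ids are illustrative):

import tensorflow as tf

padding_idx = 1
input_ids = tf.constant([[5, 6, 7, padding_idx, padding_idx]])

mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=input_ids.dtype)
position_ids = tf.math.cumsum(mask, axis=1) * mask + padding_idx
print(position_ids.numpy())  # [[2 3 4 1 1]]: real tokens count up from padding_idx + 1,
                             # padded slots stay at padding_idx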
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
inputs_embeds: tf.Tensor
Returns: tf.Tensor
"""
batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2]
position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :]

return tf.tile(input=position_ids, multiples=(batch_size, 1))

def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.

Returns:
final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)

if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)

if position_ids is None:
if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds)

position_embeds = self.position_embeddings(position_ids=position_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)

return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler
...@@ -536,12 +564,12 @@ class TFMPNetMainLayer(tf.keras.layers.Layer): ...@@ -536,12 +564,12 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
...@@ -808,6 +836,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer): ...@@ -808,6 +836,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -827,7 +856,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer): ...@@ -827,7 +856,7 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
return self.decoder return self.decoder
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.decoder.word_embeddings = value self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0] self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
...@@ -837,15 +866,19 @@ class TFMPNetLMHead(tf.keras.layers.Layer): ...@@ -837,15 +866,19 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
self.bias = value["bias"] self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0] self.vocab_size = shape_list(value["bias"])[0]
def call(self, features): def call(self, hidden_states):
x = self.dense(features) hidden_states = self.dense(hidden_states)
x = self.act(x) hidden_states = self.act(hidden_states)
x = self.layer_norm(x) hidden_states = self.layer_norm(hidden_states)
# project back to size of vocabulary with bias # project back to size of vocabulary with bias
x = self.decoder(x, mode="linear") + self.bias seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return x return hidden_states
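The projection above ties the output layer to the input embedding matrix: hidden states are multiplied by the transposed decoder weight and a bias is added. A minimal sketch with illustrative sizes:

import tensorflow as tf

batch_size, seq_length, hidden_size, vocab_size = 2, 5, 8, 50
hidden_states = tf.random.normal((batch_size, seq_length, hidden_size))
decoder_weight = tf.random.normal((vocab_size, hidden_size))  # shared with the input embeddings
bias = tf.zeros((vocab_size,))

# Project back to vocabulary size with the transposed embedding matrix (weight tying).
x = tf.reshape(hidden_states, [-1, hidden_size])
logits = tf.matmul(x, decoder_weight, transpose_b=True)
logits = tf.reshape(logits, [batch_size, seq_length, vocab_size])
logits = tf.nn.bias_add(logits, bias)
print(logits.shape)  # (2, 5, 50)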
@add_start_docstrings("""MPNet Model with a `language modeling` head on top. """, MPNET_START_DOCSTRING) @add_start_docstrings("""MPNet Model with a `language modeling` head on top. """, MPNET_START_DOCSTRING)
...@@ -857,7 +890,7 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss): ...@@ -857,7 +890,7 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.mpnet = TFMPNetMainLayer(config, name="mpnet") self.mpnet = TFMPNetMainLayer(config, name="mpnet")
self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings, name="lm_head") self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings.word_embeddings, name="lm_head")
def get_lm_head(self): def get_lm_head(self):
return self.lm_head return self.lm_head
......
...@@ -65,6 +65,127 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -65,6 +65,127 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
] ]
# Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TFRobertaWordEmbeddings(tf.keras.layers.Layer):
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
name="weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TFRobertaTokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.type_vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape=input_shape)
def get_config(self):
config = {
"type_vocab_size": self.type_vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, token_type_ids):
flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
return embeddings
# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerPositionEmbeddings
class TFRobertaPositionEmbeddings(tf.keras.layers.Layer):
def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
super().build(input_shape)
def get_config(self):
config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, position_ids):
flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1])
embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0)
)
embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size])
return embeddings
class TFRobertaEmbeddings(tf.keras.layers.Layer): class TFRobertaEmbeddings(tf.keras.layers.Layer):
""" """
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
...@@ -74,52 +195,48 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -74,52 +195,48 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.padding_idx = 1 self.padding_idx = 1
self.word_embeddings = TFRobertaWordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TFRobertaPositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TFRobertaTokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def create_position_ids_from_input_ids(self, input_ids):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
input_ids: tf.Tensor
Returns: tf.Tensor
"""
input_ids_shape = shape_list(tensor=input_ids)

# multiple choice has 3 dimensions
if len(input_ids_shape) == 3:
input_ids = tf.reshape(
tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2])
)

mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask

return incremental_indices + self.padding_idx
...@@ -132,96 +249,41 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): ...@@ -132,96 +249,41 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor Returns: tf.Tensor
""" """
batch_size, seq_length = shape_list(tensor=inputs_embeds)[:2]
position_ids = tf.range(start=self.padding_idx + 1, limit=seq_length + self.padding_idx + 1)[tf.newaxis, :]

return tf.tile(input=position_ids, multiples=(batch_size, 1))
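When only inputs_embeds is given, padding cannot be detected, so the ids are simply sequential starting at padding_idx + 1 and tiled over the batch:

import tensorflow as tf

padding_idx = 1
batch_size, seq_length = 2, 4  # would normally come from shape_list(inputs_embeds)[:2]

position_ids = tf.range(start=padding_idx + 1, limit=seq_length + padding_idx + 1)[tf.newaxis, :]
position_ids = tf.tile(position_ids, multiples=(batch_size, 1))
print(position_ids.numpy())  # [[2 3 4 5]
                             #  [2 3 4 5]]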
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.

Returns:
final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)

if input_ids is not None:
inputs_embeds = self.word_embeddings(input_ids=input_ids)

if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)

if position_ids is None:
if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds=inputs_embeds)

position_embeds = self.position_embeddings(position_ids=position_ids)
token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)

return final_embeddings
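Using tf.keras.layers.Add() instead of a plain `+` gives the sum its own named node in the Keras graph; numerically the two are the same, as a quick check shows:

import tensorflow as tf

word = tf.random.normal((2, 4, 8))
position = tf.random.normal((2, 4, 8))
token_type = tf.random.normal((2, 4, 8))

summed = tf.keras.layers.Add()([word, position, token_type])
tf.debugging.assert_near(summed, word + position + token_type)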
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler
...@@ -245,7 +307,7 @@ class TFRobertaPooler(tf.keras.layers.Layer): ...@@ -245,7 +307,7 @@ class TFRobertaPooler(tf.keras.layers.Layer):
return pooled_output return pooled_output
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention
class TFRobertaSelfAttention(tf.keras.layers.Layer): class TFRobertaSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -256,8 +318,8 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): ...@@ -256,8 +318,8 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
f"of attention heads ({config.num_attention_heads})" f"of attention heads ({config.num_attention_heads})"
) )
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.query = tf.keras.layers.experimental.EinsumDense( self.query = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
...@@ -293,7 +355,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): ...@@ -293,7 +355,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)
if attention_mask is not None: if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in TFRobertaModel call() function) # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
attention_scores = attention_scores + attention_mask attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities. # Normalize the attention scores to probabilities.
...@@ -324,9 +386,9 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer): ...@@ -324,9 +386,9 @@ class TFRobertaSelfOutput(tf.keras.layers.Layer):
f"of attention heads ({config.num_attention_heads})" f"of attention heads ({config.num_attention_heads})"
) )
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = config.num_attention_heads * self.attention_head_size
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abcd,cde->abe", equation="abcd,cde->abe",
output_shape=(None, self.all_head_size), output_shape=(None, self.all_head_size),
...@@ -499,12 +561,12 @@ class TFRobertaMainLayer(tf.keras.layers.Layer): ...@@ -499,12 +561,12 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
...@@ -814,6 +876,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): ...@@ -814,6 +876,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense( self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -833,7 +896,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): ...@@ -833,7 +896,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
return self.decoder return self.decoder
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.decoder.word_embeddings = value self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0] self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
...@@ -849,7 +912,11 @@ class TFRobertaLMHead(tf.keras.layers.Layer): ...@@ -849,7 +912,11 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
hidden_states = self.layer_norm(hidden_states) hidden_states = self.layer_norm(hidden_states)
# project back to size of vocabulary with bias # project back to size of vocabulary with bias
hidden_states = self.decoder(hidden_states, mode="linear") + self.bias seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
...@@ -863,7 +930,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -863,7 +930,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta") self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings.word_embeddings, name="lm_head")
def get_lm_head(self): def get_lm_head(self):
return self.lm_head return self.lm_head
......
...@@ -66,128 +66,182 @@ TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -66,128 +66,182 @@ TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [
] ]
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}} # Copied from transformers.models.bert.modeling_tf_bert.TFBertWordEmbeddings
class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}WordEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = vocab_size
self.hidden_size = config.hidden_size self.hidden_size = hidden_size
self.initializer_range = config.initializer_range self.initializer_range = initializer_range
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings, def build(self, input_shape):
config.hidden_size, self.weight = self.add_weight(
embeddings_initializer=get_initializer(self.initializer_range), name="weight",
name="position_embeddings", shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
) )
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size, super().build(input_shape=input_shape)
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range), def get_config(self):
name="token_type_embeddings", config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, input_ids):
flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1])
embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0)
) )
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size])
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") return embeddings
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertTokenTypeEmbeddings
class TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings(tf.keras.layers.Layer):
def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs):
super().__init__(**kwargs)
self.type_vocab_size = type_vocab_size
self.hidden_size = hidden_size
self.initializer_range = initializer_range
def build(self, input_shape): def build(self, input_shape):
"""Build shared word embedding layer """ self.token_type_embeddings = self.add_weight(
with tf.name_scope("word_embeddings"): name="embeddings",
# Create and initialize weights. The random normal initializer was chosen shape=[self.type_vocab_size, self.hidden_size],
# arbitrarily, and works well. initializer=get_initializer(initializer_range=self.initializer_range),
self.word_embeddings = self.add_weight( )
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape) super().build(input_shape=input_shape)
def call( def get_config(self):
self, config = {
input_ids=None, "type_vocab_size": self.type_vocab_size,
position_ids=None, "hidden_size": self.hidden_size,
token_type_ids=None, "initializer_range": self.initializer_range,
inputs_embeds=None, }
mode="embedding", base_config = super().get_config()
training=False,
):
"""
Get token embeddings of inputs.
Args: return dict(list(base_config.items()) + list(config.items()))
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
mode: string, a valid value is one of "embedding" and "linear".
Returns: def call(self, token_type_ids):
outputs: If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1])
embedding_size]; if mode == "linear", output linear tensor, float32 with shape [batch_size, length, one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
vocab_size]. embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings)
embeddings = tf.reshape(
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0)
)
Raises: embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size])
ValueError: if mode is not valid.
Shared weights logic adapted from return embeddings
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None: # Copied from transformers.models.bert.modeling_tf_bert.TFBertPositionEmbeddings
input_shape = shape_list(input_ids) class TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(tf.keras.layers.Layer):
else: def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs):
input_shape = shape_list(inputs_embeds)[:-1] super().__init__(**kwargs)
seq_length = input_shape[1] self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.initializer_range = initializer_range
if position_ids is None: def build(self, input_shape):
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
if token_type_ids is None: super().build(input_shape)
token_type_ids = tf.fill(input_shape, 0)
if inputs_embeds is None: def get_config(self):
inputs_embeds = tf.gather(self.word_embeddings, input_ids) config = {
"max_position_embeddings": self.max_position_embeddings,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) return dict(list(base_config.items()) + list(config.items()))
token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings def call(self, position_ids):
input_shape = shape_list(tensor=position_ids)
position_embeddings = self.position_embeddings[: input_shape[1], :]
def _linear(self, inputs): return tf.broadcast_to(input=position_embeddings, shape=input_shape)
"""
Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size]. # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = TF{{cookiecutter.camelcase_modelname}}WordEmbeddings(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="word_embeddings",
)
self.position_embeddings = TF{{cookiecutter.camelcase_modelname}}PositionEmbeddings(
max_position_embeddings=config.max_position_embeddings,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="position_embeddings",
)
self.token_type_embeddings = TF{{cookiecutter.camelcase_modelname}}TokenTypeEmbeddings(
type_vocab_size=config.type_vocab_size,
hidden_size=config.hidden_size,
initializer_range=config.initializer_range,
name="token_type_embeddings",
)
self.embeddings_sum = tf.keras.layers.Add()
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
""" """
batch_size = shape_list(inputs)[0] assert not (input_ids is None and inputs_embeds is None)
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.hidden_size]) if input_ids is not None:
logits = tf.matmul(x, self.word_embeddings, transpose_b=True) inputs_embeds = self.word_embeddings(input_ids=input_ids)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) if token_type_ids is None:
input_shape = shape_list(tensor=inputs_embeds)[:-1]
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_embeds = self.position_embeddings(position_ids=inputs_embeds)
else:
position_embeds = self.position_embeddings(position_ids=position_ids)
token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids)
final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds])
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}}
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention
class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer): class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -198,8 +252,8 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) ...@@ -198,8 +252,8 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer)
f"of attention heads ({config.num_attention_heads})" f"of attention heads ({config.num_attention_heads})"
) )
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.query = tf.keras.layers.experimental.EinsumDense( self.query = tf.keras.layers.experimental.EinsumDense(
equation="abc,cde->abde", equation="abc,cde->abde",
output_shape=(None, config.num_attention_heads, self.attention_head_size), output_shape=(None, config.num_attention_heads, self.attention_head_size),
...@@ -266,9 +320,9 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): ...@@ -266,9 +320,9 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer):
f"of attention heads ({config.num_attention_heads})" f"of attention heads ({config.num_attention_heads})"
) )
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = config.num_attention_heads * self.attention_head_size
self.dense = tf.keras.layers.experimental.EinsumDense( self.dense = tf.keras.layers.experimental.EinsumDense(
equation="abcd,cde->abe", equation="abcd,cde->abe",
output_shape=(None, self.all_head_size), output_shape=(None, self.all_head_size),
...@@ -450,6 +504,8 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay ...@@ -450,6 +504,8 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
super().__init__(**kwargs) super().__init__(**kwargs)
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform") self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is # The output weights are the same as the input embeddings, but there is
...@@ -465,7 +521,7 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay ...@@ -465,7 +521,7 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value): def set_output_embeddings(self, value):
self.input_embeddings.word_embeddings = value self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0] self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self): def get_bias(self):
...@@ -476,9 +532,12 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay ...@@ -476,9 +532,12 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
self.vocab_size = shape_list(value["bias"])[0] self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states): def call(self, hidden_states):
hidden_states = self.transform(hidden_states) hidden_states = self.transform(hidden_states=hidden_states)
hidden_states = self.input_embeddings(hidden_states, mode="linear") seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = hidden_states + self.bias hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states return hidden_states
...@@ -514,11 +573,11 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): ...@@ -514,11 +573,11 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer):
self.config = config self.config = config
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, value): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0] self.embeddings.word_embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model. """Prunes heads of the model.
...@@ -812,7 +871,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca
         )

         self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
-        self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
+        self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls")

     def get_lm_head(self):
         return self.mlm.predictions
...@@ -909,7 +968,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca
         logger.warning("If you want to use `TF{{cookiecutter.camelcase_modelname}}ForCausalLM` as a standalone, add `is_decoder=True.`")

         self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
-        self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
+        self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls")

     def get_lm_head(self):
         return self.mlm.predictions
...
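Because the MLM head is now handed the same word-embedding sub-layer the encoder uses, growing the vocabulary goes through the usual `resize_token_embeddings` API. A hedged usage sketch (checkpoint name and the added tokens are illustrative only):

```python
from transformers import BertTokenizer, TFBertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForMaskedLM.from_pretrained("bert-base-uncased")

tokenizer.add_tokens(["<new_token_1>", "<new_token_2>"])
model.resize_token_embeddings(len(tokenizer))    # grows the tied embedding matrix by two rows

print(model.get_input_embeddings().weights[0].shape)  # (30524, 768) for this checkpoint
```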
...@@ -760,31 +760,6 @@ class TFModelTesterMixin:
                 model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
             )

-    def _get_embeds(self, wte, input_ids):
-        # ^^ In our TF models, the input_embeddings can take slightly different forms,
-        # so we try a few of them.
-        # We used to fall back to just synthetically creating a dummy tensor of ones:
-        try:
-            x = wte(input_ids, mode="embedding")
-        except Exception:
-            try:
-                x = wte([input_ids], mode="embedding")
-            except Exception:
-                try:
-                    x = wte([input_ids, None, None, None], mode="embedding")
-                except Exception:
-                    if hasattr(self.model_tester, "embedding_size"):
-                        x = tf.ones(
-                            input_ids.shape + [self.model_tester.embedding_size],
-                            dtype=tf.dtypes.float32,
-                        )
-                    else:
-                        x = tf.ones(
-                            input_ids.shape + [self.model_tester.hidden_size],
-                            dtype=tf.dtypes.float32,
-                        )
-        return x

     def test_inputs_embeds(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -801,12 +776,11 @@ class TFModelTesterMixin:
                 del inputs["input_ids"]
                 inputs.pop("decoder_input_ids", None)

-            wte = model.get_input_embeddings()
             if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = self._get_embeds(wte, input_ids)
+                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
             else:
-                inputs["inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
+                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
+                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)

             model(inputs)
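The simplified test relies on the layer returned by `get_input_embeddings()` being callable directly on integer ids to produce embedding vectors. A minimal sketch of the same flow outside the test harness (model, checkpoint, and token ids are placeholders):

```python
import tensorflow as tf
from transformers import TFBertModel

model = TFBertModel.from_pretrained("bert-base-uncased")
input_ids = tf.constant([[101, 7592, 2088, 102]])  # illustrative token ids

inputs_embeds = model.get_input_embeddings()(input_ids)   # (1, 4, hidden_size)
outputs = model({"inputs_embeds": inputs_embeds})

print(outputs.last_hidden_state.shape)  # (1, 4, 768)
```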
...@@ -837,24 +811,25 @@ class TFModelTesterMixin:
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         def _get_word_embedding_weight(model, embedding_layer):
-            if hasattr(embedding_layer, "word_embeddings"):
-                return embedding_layer.word_embeddings
-            elif hasattr(embedding_layer, "weight"):
-                return embedding_layer.weight
-            elif hasattr(embedding_layer, "decoder"):
-                return embedding_layer.decoder
-            else:
-                # Here we build the word embeddings weights if not exists.
-                # And then we retry to get the attribute once built.
-                model(model.dummy_inputs)
-                if hasattr(embedding_layer, "word_embeddings"):
-                    return embedding_layer.word_embeddings
-                elif hasattr(embedding_layer, "weight"):
-                    return embedding_layer.weight
-                elif hasattr(embedding_layer, "decoder"):
-                    return embedding_layer.decoder
-                else:
-                    return None
+            embeds = getattr(embedding_layer, "weight", None)
+            if embeds is not None:
+                return embeds
+
+            embeds = getattr(embedding_layer, "decoder", None)
+            if embeds is not None:
+                return embeds
+
+            model(model.dummy_inputs)
+
+            embeds = getattr(embedding_layer, "weight", None)
+            if embeds is not None:
+                return embeds
+
+            embeds = getattr(embedding_layer, "decoder", None)
+            if embeds is not None:
+                return embeds
+
+            return None

         for model_class in self.all_model_classes:
             for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
...
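For context, a standalone sketch of the kind of check this loop performs with the helper above (the configuration values and the assertion are assumptions here, not the actual test body, and the sketch assumes a `_get_word_embedding_weight` helper like the one above is in scope):

```python
from transformers import BertConfig, TFBertModel

config = BertConfig(vocab_size=99, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=4, intermediate_size=37)
model = TFBertModel(config)

# Grab the weight before and after resizing and compare the vocab dimension.
old_weight = _get_word_embedding_weight(model, model.get_input_embeddings())
model.resize_token_embeddings(config.vocab_size + 10)
new_weight = _get_word_embedding_weight(model, model.get_input_embeddings())

assert new_weight.shape[0] == old_weight.shape[0] + 10
```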