Unverified Commit 5340d1f2 authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into resumable_http

parents 0e4cc050 10bd1ddb
@@ -35,6 +35,8 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
}
class RobertaEmbeddings(BertEmbeddings):
@@ -48,16 +50,24 @@ class RobertaEmbeddings(BertEmbeddings):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
padding_idx=self.padding_idx)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
device = input_ids.device if input_ids is not None else inputs_embeds.device
if position_ids is None:
# Position numbers begin at padding_idx+1. Padding symbols are ignored.
# cf. fairseq's `utils.make_positions`
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0).expand(input_shape)
return super(RobertaEmbeddings, self).forward(input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds)
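A minimal standalone sketch of the position-id offset described in the comment above, assuming RoBERTa's usual padding_idx of 1 (the values here are illustrative, not taken from the diff):

import torch

padding_idx = 1
seq_length = 5
# BERT-style positions start at 0 ...
bert_positions = torch.arange(seq_length, dtype=torch.long)            # tensor([0, 1, 2, 3, 4])
# ... RoBERTa-style positions start at padding_idx + 1, mirroring fairseq's make_positions.
roberta_positions = torch.arange(padding_idx + 1, seq_length + padding_idx + 1, dtype=torch.long)  # tensor([2, 3, 4, 5, 6])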
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
@@ -126,6 +136,10 @@ ROBERTA_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
@@ -169,18 +183,11 @@ class RobertaModel(BertModel):
self.embeddings = RobertaEmbeddings(config)
self.init_weights()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
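A minimal sketch of what the new embedding accessors and the inputs_embeds argument allow together, assuming the post-merge API and the 'roberta-base' checkpoint:

import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

input_ids = torch.tensor([tokenizer.encode("Hello world", add_special_tokens=True)])

# Look up the word embeddings manually through the new accessor ...
word_embeddings = model.get_input_embeddings()   # an nn.Embedding module
inputs_embeds = word_embeddings(input_ids)       # (1, seq_len, hidden_size)

# ... and feed them to the model in place of input_ids.
outputs = model(inputs_embeds=inputs_embeds)
last_hidden_state = outputs[0]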
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@@ -225,21 +232,18 @@ class RobertaForMaskedLM(BertPreTrainedModel):
self.lm_head = RobertaLMHead(config)
self.init_weights()
def get_output_embeddings(self):
return self.lm_head.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None):
outputs = self.roberta(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
@@ -320,13 +324,14 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
self.roberta = RobertaModel(config)
self.classifier = RobertaClassificationHead(config)
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
outputs = self.roberta(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
@@ -343,6 +348,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@@ -382,6 +388,10 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
@@ -425,8 +435,8 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None,
position_ids=None, head_mask=None, inputs_embeds=None):
num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
@@ -451,6 +461,82 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class RobertaForTokenClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
Classification scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
config_class = RobertaConfig
pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "roberta"
def __init__(self, config):
super(RobertaForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
self.roberta = RobertaModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.roberta(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), scores, (hidden_states), (attentions)
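The "active loss" masking above can be reproduced in isolation; a minimal sketch with made-up shapes, keeping only positions where attention_mask == 1 in the token-level loss:

import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 4, num_labels)            # (batch, seq_len, num_labels)
labels = torch.randint(0, num_labels, (2, 4))     # (batch, seq_len)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])     # 0 marks padding

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)     # padding positions do not contribute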
class RobertaClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
...
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 ALBERT model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .configuration_albert import AlbertConfig
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .file_utils import add_start_docstrings
import logging
logger = logging.getLogger(__name__)
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-tf_model.h5",
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-tf_model.h5",
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-tf_model.h5",
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-tf_model.h5",
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
}
class TFAlbertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config, **kwargs):
super(TFAlbertEmbeddings, self).__init__(**kwargs)
self.config = config
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
config.embedding_size,
embeddings_initializer=get_initializer(
self.config.initializer_range),
name='position_embeddings')
self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
config.embedding_size,
embeddings_initializer=get_initializer(
self.config.initializer_range),
name='token_type_embeddings')
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape):
"""Build shared word embedding layer """
with tf.name_scope("word_embeddings"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.config.vocab_size, self.config.embedding_size],
initializer=get_initializer(self.config.initializer_range))
super(TFAlbertEmbeddings, self).build(input_shape)
def call(self, inputs, mode="embedding", training=False):
"""Get token embeddings of inputs.
Args:
inputs: list of four tensors: (input_ids, position_ids, token_type_ids, inputs_embeds); the id tensors are int64 with shape [batch_size, length]
mode: string, a valid value is one of "embedding" and "linear".
Returns:
outputs: (1) If mode == "embedding", output embedding tensor, float32 with
shape [batch_size, length, embedding_size]; (2) mode == "linear", output
linear tensor, float32 with shape [batch_size, length, vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(inputs, training=training)
elif mode == "linear":
return self._linear(inputs)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, inputs, training=False):
"""Applies embedding based on inputs tensor."""
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
if input_ids is not None:
input_shape = tf.shape(input_ids)
else:
input_shape = tf.shape(inputs_embeds)[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings
def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [batch_size, length, embedding_size]
Returns:
float32 tensor with shape [batch_size, length, vocab_size].
"""
batch_size = tf.shape(inputs)[0]
length = tf.shape(inputs)[1]
x = tf.reshape(inputs, [-1, self.config.embedding_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.config.vocab_size])
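A minimal illustration of the two modes of this shared-embedding layer, using plain TF ops and made-up sizes: "embedding" gathers rows of the matrix, while "linear" reuses the same matrix, transposed, as the output projection (weight tying):

import tensorflow as tf

vocab_size, embedding_size = 10, 4
word_embeddings = tf.random.normal([vocab_size, embedding_size])

# mode == "embedding": ids -> vectors
input_ids = tf.constant([[1, 2, 3]])                           # (batch, length)
embedded = tf.gather(word_embeddings, input_ids)               # (1, 3, embedding_size)

# mode == "linear": vectors -> vocabulary logits with the same matrix
hidden = tf.reshape(embedded, [-1, embedding_size])            # (batch * length, embedding_size)
logits = tf.matmul(hidden, word_embeddings, transpose_b=True)  # (batch * length, vocab_size)
logits = tf.reshape(logits, [1, 3, vocab_size])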
class TFAlbertSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertSelfAttention, self).__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.output_attentions = config.output_attentions
self.num_attention_heads = config.num_attention_heads
assert config.hidden_size % config.num_attention_heads == 0
self.attention_head_size = int(
config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='query')
self.key = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='key')
self.value = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='value')
self.dropout = tf.keras.layers.Dropout(
config.attention_probs_dropout_prob)
def transpose_for_scores(self, x, batch_size):
x = tf.reshape(
x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
batch_size = tf.shape(hidden_states)[0]
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# Take the dot product between "query" and "key" to get the raw attention scores.
# (batch size, num_heads, seq_len_q, seq_len_k)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
# scale attention_scores
dk = tf.cast(tf.shape(key_layer)[-1], tf.float32)
attention_scores = attention_scores / tf.math.sqrt(dk)
if attention_mask is not None:
# Apply the attention mask (precomputed for all layers in the TFAlbertModel call() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = tf.nn.softmax(attention_scores, axis=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs, training=training)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer,
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
outputs = (context_layer, attention_probs) if self.output_attentions else (
context_layer,)
return outputs
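A small sketch of what transpose_for_scores and the scaled dot-product above do to tensor shapes, with made-up dimensions:

import tensorflow as tf

batch_size, seq_len, num_heads, head_size = 2, 5, 4, 8
x = tf.random.normal([batch_size, seq_len, num_heads * head_size])

# Split the hidden axis into heads and move the head axis before the sequence axis.
x = tf.reshape(x, (batch_size, -1, num_heads, head_size))      # (batch, seq, heads, head_size)
x = tf.transpose(x, perm=[0, 2, 1, 3])                         # (batch, heads, seq, head_size)

# Per-head attention is then a single batched matmul.
scores = tf.matmul(x, x, transpose_b=True) / tf.math.sqrt(float(head_size))
probs = tf.nn.softmax(scores, axis=-1)                         # (batch, heads, seq, seq)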
class TFAlbertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertSelfOutput, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
hidden_states, input_tensor = inputs
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class TFAlbertAttention(TFBertSelfAttention):
def __init__(self, config, **kwargs):
super(TFAlbertAttention, self).__init__(config, **kwargs)
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.pruned_heads = set()
def prune_heads(self, heads):
raise NotImplementedError
def call(self, inputs, training=False):
input_tensor, attention_mask, head_mask = inputs
batch_size = tf.shape(input_tensor)[0]
mixed_query_layer = self.query(input_tensor)
mixed_key_layer = self.key(input_tensor)
mixed_value_layer = self.value(input_tensor)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# Take the dot product between "query" and "key" to get the raw attention scores.
# (batch size, num_heads, seq_len_q, seq_len_k)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
# scale attention_scores
dk = tf.cast(tf.shape(key_layer)[-1], tf.float32)
attention_scores = attention_scores / tf.math.sqrt(dk)
if attention_mask is not None:
# Apply the attention mask (precomputed for all layers in the TFBertModel call() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = tf.nn.softmax(attention_scores, axis=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs, training=training)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer,
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
self_outputs = (context_layer, attention_probs) if self.output_attentions else (
context_layer,)
hidden_states = self_outputs[0]
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
attention_output = self.LayerNorm(hidden_states + input_tensor)
# add attentions if we output them
outputs = (attention_output,) + self_outputs[1:]
return outputs
class TFAlbertLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertLayer, self).__init__(**kwargs)
self.attention = TFAlbertAttention(config, name='attention')
self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer(
config.initializer_range), name='ffn')
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), name='ffn_output')
self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='full_layer_layer_norm')
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
attention_outputs = self.attention(
[hidden_states, attention_mask, head_mask], training=training)
ffn_output = self.ffn(attention_outputs[0])
ffn_output = self.activation(ffn_output)
ffn_output = self.ffn_output(ffn_output)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.full_layer_layer_norm(
ffn_output + attention_outputs[0])
# add attentions if we output them
outputs = (hidden_states,) + attention_outputs[1:]
return outputs
class TFAlbertLayerGroup(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertLayerGroup, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.albert_layers = [TFAlbertLayer(config, name="albert_layers_._{}".format(
i)) for i in range(config.inner_group_num)]
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
layer_hidden_states = ()
layer_attentions = ()
for layer_index, albert_layer in enumerate(self.albert_layers):
layer_output = albert_layer(
[hidden_states, attention_mask, head_mask[layer_index]], training=training)
hidden_states = layer_output[0]
if self.output_attentions:
layer_attentions = layer_attentions + (layer_output[1],)
if self.output_hidden_states:
layer_hidden_states = layer_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
outputs = outputs + (layer_hidden_states,)
if self.output_attentions:
outputs = outputs + (layer_attentions,)
# last-layer hidden state, (layer hidden states), (layer attentions)
return outputs
class TFAlbertTransformer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertTransformer, self).__init__(**kwargs)
self.config = config
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.embedding_hidden_mapping_in = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), name='embedding_hidden_mapping_in')
self.albert_layer_groups = [TFAlbertLayerGroup(
config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups)]
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
all_attentions = ()
if self.output_hidden_states:
all_hidden_states = (hidden_states,)
for i in range(self.config.num_hidden_layers):
# Number of layers in a hidden group
layers_per_group = int(
self.config.num_hidden_layers / self.config.num_hidden_groups)
# Index of the hidden group
group_idx = int(
i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
layer_group_output = self.albert_layer_groups[group_idx](
[hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]], training=training)
hidden_states = layer_group_output[0]
if self.output_attentions:
all_attentions = all_attentions + layer_group_output[-1]
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
outputs = outputs + (all_attentions,)
# last-layer hidden state, (all hidden states), (all attentions)
return outputs
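A worked example of the group-index arithmetic above; ALBERT's default is a single hidden group shared by every layer, and two groups are used here only to make the indexing visible:

num_hidden_layers, num_hidden_groups = 12, 2
layers_per_group = num_hidden_layers // num_hidden_groups   # 6
for i in range(num_hidden_layers):
    group_idx = i // layers_per_group                       # 0 for layers 0-5, 1 for layers 6-11
    head_mask_slice = slice(group_idx * layers_per_group, (group_idx + 1) * layers_per_group)
    print(i, group_idx, head_mask_slice)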
class TFAlbertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = AlbertConfig
pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "albert"
class TFAlbertMLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super(TFAlbertMLMHead, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.dense = tf.keras.layers.Dense(config.embedding_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.decoder_bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='decoder/bias')
super(TFAlbertMLMHead, self).build(input_shape)
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias
hidden_states = hidden_states + self.bias
return hidden_states
ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in
`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT.
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matters related to general usage and behavior.
.. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
https://arxiv.org/abs/1909.11942
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
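A minimal sketch of the three call styles listed above, assuming the post-merge TF 2.0 API and the 'albert-base-v2' checkpoint; the token ids are made up:

import tensorflow as tf
from transformers import TFAlbertModel

model = TFAlbertModel.from_pretrained('albert-base-v2')
input_ids = tf.constant([[2, 10, 14, 3]])       # (batch, seq_len), illustrative ids
attention_mask = tf.ones_like(input_ids)
token_type_ids = tf.zeros_like(input_ids)

outputs = model(input_ids)                                    # single tensor
outputs = model([input_ids, attention_mask, token_type_ids])  # list, in docstring order
outputs = model({'input_ids': input_ids,
                 'attention_mask': attention_mask,
                 'token_type_ids': token_type_ids})           # dict keyed by input name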
Parameters:
config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
ALBERT_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
To match pre-training, ALBERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs:
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
(b) For single sequences:
``tokens: [CLS] the dog is hairy . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0``
Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`transformers.AlbertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
(see `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_ for more details).
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
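A small sketch of producing the [CLS]/[SEP] layout and matching token_type_ids described above with the tokenizer rather than by hand, assuming the transformers 2.x encode_plus API:

from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
encoded = tokenizer.encode_plus("is this jacksonville ?", "no it is not .",
                                add_special_tokens=True)
input_ids = encoded['input_ids']            # [CLS] sentence A [SEP] sentence B [SEP]
token_type_ids = encoded['token_type_ids']  # 0s over sentence A, 1s over sentence B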
"""
@add_start_docstrings("The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertModel(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during Albert pretraining. This output is usually *not* a good summary
of the semantic content of the input; you're often better off averaging or pooling
the sequence of hidden-states for the whole input sequence.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertModel.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, **kwargs):
super(TFAlbertModel, self).__init__(config, **kwargs)
self.num_hidden_layers = config.num_hidden_layers
self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
self.encoder = TFAlbertTransformer(config, name="encoder")
self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), activation='tanh', name='pooler')
def get_input_embeddings(self):
return self.embeddings
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = tf.shape(input_ids)
elif inputs_embeds is not None:
input_shape = inputs_embeds.shape[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = tf.fill(input_shape, 1)
if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
embedding_output = self.embeddings(
[input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
encoder_outputs = self.encoder(
[embedding_output, extended_attention_mask, head_mask], training=training)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output[:, 0])
# add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
# sequence_output, pooled_output, (hidden_states), (attentions)
return outputs
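The extended attention mask built in call() above can be reproduced in isolation; a minimal sketch with a made-up (batch, seq_len) mask:

import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0, 0]])          # 0 marks padding
extended = attention_mask[:, tf.newaxis, tf.newaxis, :]  # (batch, 1, 1, seq_len), broadcastable over heads
extended = tf.cast(extended, tf.float32)
extended = (1.0 - extended) * -10000.0                   # 0.0 where attended, -10000.0 where masked
# `extended` is added to the raw attention scores before the softmax.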
@add_start_docstrings("""Albert Model with a `language modeling` head on top. """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForMaskedLM
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
self.albert = TFAlbertModel(config, name='albert')
self.predictions = TFAlbertMLMHead(
config, self.albert.embeddings, name='predictions')
def get_output_embeddings(self):
return self.albert.embeddings
def call(self, inputs, **kwargs):
outputs = self.albert(inputs, **kwargs)
sequence_output = outputs[0]
prediction_scores = self.predictions(
sequence_output, training=kwargs.get('training', False))
# Add hidden states and attention if they are here
outputs = (prediction_scores,) + outputs[2:]
return outputs # prediction_scores, (hidden_states), (attentions)
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.albert = TFAlbertModel(config, name='albert')
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
def call(self, inputs, **kwargs):
outputs = self.albert(inputs, **kwargs)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # logits, (hidden_states), (attentions)
\ No newline at end of file
@@ -142,19 +142,25 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
def _embedding(self, inputs, training=False):
"""Applies embedding based on inputs tensor."""
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
if input_ids is not None:
input_shape = tf.shape(input_ids)
else:
input_shape = tf.shape(inputs_embeds)[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings
@@ -460,6 +466,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
self.encoder = TFBertEncoder(config, name='encoder')
self.pooler = TFBertPooler(config, name='pooler')
def get_input_embeddings(self):
return self.embeddings
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
@@ -470,28 +479,39 @@ class TFBertMainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.shape
elif inputs_embeds is not None:
input_shape = inputs_embeds.shape[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = tf.fill(input_shape, 1)
if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
@@ -520,7 +540,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
sequence_output = encoder_outputs[0]
@@ -616,6 +636,10 @@ BERT_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
@@ -698,6 +722,9 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
self.nsp = TFBertNSPHead(config, name='nsp___cls')
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
def get_output_embeddings(self):
return self.bert.embeddings
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
@@ -743,6 +770,9 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
self.bert = TFBertMainLayer(config, name='bert')
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
def get_output_embeddings(self):
return self.bert.embeddings
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
@@ -888,33 +918,39 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask) attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids) token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids) position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 5, "Too many inputs." inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
num_choices = tf.shape(input_ids)[1] if input_ids is not None:
seq_length = tf.shape(input_ids)[2] num_choices = tf.shape(input_ids)[1]
seq_length = tf.shape(input_ids)[2]
else:
num_choices = tf.shape(inputs_embeds)[1]
seq_length = tf.shape(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
outputs = self.bert(flat_inputs, training=training) outputs = self.bert(flat_inputs, training=training)
......
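The multiple-choice flattening above is easiest to see with concrete shapes. A standalone TensorFlow sketch with toy sizes, not tied to the model classes:

    import tensorflow as tf

    # A multiple-choice batch is 3-D: (batch_size, num_choices, seq_length).
    batch_size, num_choices, seq_length = 2, 4, 7
    input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

    # The encoder only sees 2-D batches, so choices are folded into the batch dim...
    flat_input_ids = tf.reshape(input_ids, (-1, seq_length))            # (8, 7)

    # ...and the per-choice logits are unfolded again before the softmax over choices.
    flat_logits = tf.zeros((batch_size * num_choices, 1))
    reshaped_logits = tf.reshape(flat_logits, (-1, num_choices))        # (2, 4)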
...@@ -192,6 +192,9 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -192,6 +192,9 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
name='h_._{}'.format(i)) for i in range(config.n_layer)] name='h_._{}'.format(i)) for i in range(config.n_layer)]
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
def get_input_embeddings(self):
return self.w
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError raise NotImplementedError
...@@ -201,7 +204,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -201,7 +204,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
""" """
raise NotImplementedError raise NotImplementedError
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past past = inputs[1] if len(inputs) > 1 else past
...@@ -209,7 +212,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -209,7 +212,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
position_ids = inputs[4] if len(inputs) > 4 else position_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids
head_mask = inputs[5] if len(inputs) > 5 else head_mask head_mask = inputs[5] if len(inputs) > 5 else head_mask
assert len(inputs) <= 6, "Too many inputs." inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
past = inputs.get('past', past) past = inputs.get('past', past)
...@@ -217,12 +221,20 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -217,12 +221,20 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
token_type_ids = inputs.get('token_type_ids', token_type_ids) token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids) position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 6, "Too many inputs." inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 7, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
input_shape = shape_list(input_ids) if input_ids is not None and inputs_embeds is not None:
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if past is None: if past is None:
past_length = 0 past_length = 0
...@@ -230,8 +242,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -230,8 +242,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
else: else:
past_length = shape_list(past[0][0])[-2] past_length = shape_list(past[0][0])[-2]
if position_ids is None: if position_ids is None:
position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
position_ids = tf.tile(position_ids, [shape_list(input_ids)[0], 1]) position_ids = tf.tile(position_ids, [input_shape[0], 1])
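Using ``input_shape`` instead of ``shape_list(input_ids)`` keeps incremental decoding working when only embeddings are passed; the position ids simply continue from ``past_length``. A tiny numeric sketch:

    import tensorflow as tf

    # 3 tokens already cached, 2 new tokens arriving, batch of 2:
    past_length, new_len, batch = 3, 2, 2
    position_ids = tf.range(past_length, new_len + past_length, dtype=tf.int32)[tf.newaxis, :]
    position_ids = tf.tile(position_ids, [batch, 1])
    print(position_ids.numpy())   # [[3 4]
                                  #  [3 4]]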
# Attention mask. # Attention mask.
if attention_mask is not None: if attention_mask is not None:
...@@ -270,8 +282,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -270,8 +282,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
token_type_embeds = 0 token_type_embeds = 0
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
inputs_embeds = self.w(input_ids, mode='embedding') if inputs_embeds is None:
# x = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded inputs_embeds = self.w(input_ids, mode='embedding')
seq_len = input_shape[-1] seq_len = input_shape[-1]
mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
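The causal mask built here is just the strictly upper triangle of a square matrix, marking the positions a token must not attend to. A quick check of what ``band_part`` produces:

    import tensorflow as tf

    seq_len = 4
    # band_part(..., -1, 0) keeps the lower triangle (incl. diagonal);
    # 1 - that leaves ones strictly above the diagonal.
    mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    print(mask.numpy())
    # [[0. 1. 1. 1.]
    #  [0. 0. 1. 1.]
    #  [0. 0. 0. 1.]
    #  [0. 0. 0. 0.]]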
...@@ -374,6 +386,10 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs: ...@@ -374,6 +386,10 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", @add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
...@@ -476,6 +492,9 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): ...@@ -476,6 +492,9 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head") self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
def get_output_embeddings(self):
return self.lm_head.input_embeddings
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
......
...@@ -96,7 +96,7 @@ class TFEmbeddings(tf.keras.layers.Layer): ...@@ -96,7 +96,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
initializer=get_initializer(self.initializer_range)) initializer=get_initializer(self.initializer_range))
super(TFEmbeddings, self).build(input_shape) super(TFEmbeddings, self).build(input_shape)
def call(self, inputs, mode="embedding", training=False): def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
"""Get token embeddings of inputs. """Get token embeddings of inputs.
Args: Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
...@@ -112,13 +112,13 @@ class TFEmbeddings(tf.keras.layers.Layer): ...@@ -112,13 +112,13 @@ class TFEmbeddings(tf.keras.layers.Layer):
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
""" """
if mode == "embedding": if mode == "embedding":
return self._embedding(inputs, training=training) return self._embedding(inputs, inputs_embeds=inputs_embeds, training=training)
elif mode == "linear": elif mode == "linear":
return self._linear(inputs) return self._linear(inputs)
else: else:
raise ValueError("mode {} is not valid.".format(mode)) raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, inputs, training=False): def _embedding(self, inputs, inputs_embeds=None, training=False):
""" """
Parameters Parameters
---------- ----------
...@@ -136,14 +136,19 @@ class TFEmbeddings(tf.keras.layers.Layer): ...@@ -136,14 +136,19 @@ class TFEmbeddings(tf.keras.layers.Layer):
else: else:
input_ids, position_ids = inputs input_ids, position_ids = inputs
seq_length = tf.shape(input_ids)[1] if input_ids is not None:
seq_length = tf.shape(input_ids)[1]
else:
seq_length = tf.shape(inputs_embeds)[1]
if position_ids is None: if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
word_embeddings = tf.gather(self.word_embeddings, input_ids) if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim)
embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim)
embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim)
embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim)
return embeddings return embeddings
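With the change above, ``inputs_embeds`` slots in exactly where the looked-up word embeddings used to go; the rest of the pipeline (position embeddings, LayerNorm, dropout) is unchanged. A self-contained sketch of that composition with toy dimensions (the layer objects below are stand-ins, not the model's own):

    import tensorflow as tf

    bs, seq_len, dim, vocab = 2, 5, 8, 100
    word_embeddings = tf.random.normal((vocab, dim))
    position_embeddings = tf.keras.layers.Embedding(512, dim)
    layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
    dropout = tf.keras.layers.Dropout(0.1)

    input_ids = tf.random.uniform((bs, seq_len), maxval=vocab, dtype=tf.int32)
    inputs_embeds = tf.gather(word_embeddings, input_ids)             # or passed in directly
    position_ids = tf.range(seq_len, dtype=tf.int32)[tf.newaxis, :]

    embeddings = inputs_embeds + position_embeddings(position_ids)    # broadcasts over the batch
    embeddings = dropout(layer_norm(embeddings), training=False)      # (2, 5, 8)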
...@@ -398,28 +403,42 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): ...@@ -398,28 +403,42 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings
self.transformer = TFTransformer(config, name="transformer") # Encoder self.transformer = TFTransformer(config, name="transformer") # Encoder
def get_input_embeddings(self):
return self.embeddings
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError raise NotImplementedError
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
raise NotImplementedError raise NotImplementedError
def call(self, inputs, attention_mask=None, head_mask=None, training=False): def call(self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
head_mask = inputs[2] if len(inputs) > 2 else head_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask
assert len(inputs) <= 3, "Too many inputs." inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
assert len(inputs) <= 4, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask) attention_mask = inputs.get('attention_mask', attention_mask)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 3, "Too many inputs." inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 4, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None: if attention_mask is None:
attention_mask = tf.ones(shape_list(input_ids)) # (bs, seq_length) attention_mask = tf.ones(input_shape) # (bs, seq_length)
attention_mask = tf.cast(attention_mask, dtype=tf.float32) attention_mask = tf.cast(attention_mask, dtype=tf.float32)
# Prepare head mask if needed # Prepare head mask if needed
...@@ -432,7 +451,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): ...@@ -432,7 +451,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
else: else:
head_mask = [None] * self.num_hidden_layers head_mask = [None] * self.num_hidden_layers
embedding_output = self.embeddings(input_ids) # (bs, seq_length, dim) embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim)
tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training) tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training)
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
...@@ -508,6 +527,10 @@ DISTILBERT_INPUTS_DOCSTRING = r""" ...@@ -508,6 +527,10 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
...@@ -609,6 +632,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): ...@@ -609,6 +632,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
def get_output_embeddings(self):
return self.vocab_projector.input_embeddings
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
distilbert_output = self.distilbert(inputs, **kwargs) distilbert_output = self.distilbert(inputs, **kwargs)
......
...@@ -219,6 +219,9 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -219,6 +219,9 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
name='h_._{}'.format(i)) for i in range(config.n_layer)] name='h_._{}'.format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f') self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
def get_input_embeddings(self):
return self.wte
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError raise NotImplementedError
...@@ -228,7 +231,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -228,7 +231,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
""" """
raise NotImplementedError raise NotImplementedError
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past past = inputs[1] if len(inputs) > 1 else past
...@@ -236,7 +239,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -236,7 +239,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
position_ids = inputs[4] if len(inputs) > 4 else position_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids
head_mask = inputs[5] if len(inputs) > 5 else head_mask head_mask = inputs[5] if len(inputs) > 5 else head_mask
assert len(inputs) <= 6, "Too many inputs." inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
past = inputs.get('past', past) past = inputs.get('past', past)
...@@ -244,17 +248,28 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -244,17 +248,28 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
token_type_ids = inputs.get('token_type_ids', token_type_ids) token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids) position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 6, "Too many inputs." inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 7, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if past is None: if past is None:
past_length = 0 past_length = 0
past = [None] * len(self.h) past = [None] * len(self.h)
else: else:
past_length = shape_list(past[0][0])[-2] past_length = shape_list(past[0][0])[-2]
if position_ids is None: if position_ids is None:
position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
if attention_mask is not None: if attention_mask is not None:
# We create a 3D attention mask from a 2D tensor mask. # We create a 3D attention mask from a 2D tensor mask.
...@@ -286,11 +301,10 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -286,11 +301,10 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
head_mask = [None] * self.num_hidden_layers head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers) # head_mask = tf.constant([0] * self.num_hidden_layers)
input_shape = shape_list(input_ids)
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
inputs_embeds = self.wte(input_ids, mode='embedding') if inputs_embeds is None:
inputs_embeds = self.wte(input_ids, mode='embedding')
position_embeds = self.wpe(position_ids) position_embeds = self.wpe(position_ids)
if token_type_ids is not None: if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
...@@ -408,6 +422,10 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: ...@@ -408,6 +422,10 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
...@@ -486,6 +504,9 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): ...@@ -486,6 +504,9 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs) super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name='transformer') self.transformer = TFGPT2MainLayer(config, name='transformer')
def get_output_embeddings(self):
return self.transformer.wte
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
...@@ -556,7 +577,10 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): ...@@ -556,7 +577,10 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
self.transformer = TFGPT2MainLayer(config, name='transformer') self.transformer = TFGPT2MainLayer(config, name='transformer')
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False): def get_output_embeddings(self):
return self.transformer.wte
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past past = inputs[1] if len(inputs) > 1 else past
...@@ -564,8 +588,9 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): ...@@ -564,8 +588,9 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
position_ids = inputs[4] if len(inputs) > 4 else position_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids
head_mask = inputs[5] if len(inputs) > 5 else head_mask head_mask = inputs[5] if len(inputs) > 5 else head_mask
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
assert len(inputs) <= 7, "Too many inputs." mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
past = inputs.get('past', past) past = inputs.get('past', past)
...@@ -573,21 +598,25 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): ...@@ -573,21 +598,25 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
token_type_ids = inputs.get('token_type_ids', token_type_ids) token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids) position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
assert len(inputs) <= 7, "Too many inputs." assert len(inputs) <= 8, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
input_shapes = shape_list(input_ids) if input_ids is not None:
input_shapes = shape_list(input_ids)
else:
input_shapes = shape_list(inputs_embeds)[:-1]
seq_length = input_shapes[-1] seq_length = input_shapes[-1]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
transformer_outputs = self.transformer(flat_inputs, training=training) transformer_outputs = self.transformer(flat_inputs, training=training)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
......
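For reference, the double-heads model is driven with a 3-D ``input_ids`` batch plus ``mc_token_ids`` pointing at the classification position inside each choice. A hedged usage sketch: it uses the last token of each choice rather than adding a dedicated ``[CLS]`` token, and the multiple-choice head on top of the plain ``gpt2`` checkpoint is randomly initialized, so the scores are illustrative only.

    import tensorflow as tf
    from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')

    choices = ["Hello, my dog is cute", "Hello, my cat is cute"]
    encoded = [tokenizer.encode(c) for c in choices]                    # equal lengths here
    input_ids = tf.constant(encoded)[None, :]                           # (1, num_choices, seq_len)
    mc_token_ids = tf.constant([len(e) - 1 for e in encoded])[None, :]  # (1, num_choices)

    outputs = model(input_ids, mc_token_ids=mc_token_ids)
    lm_logits, mc_logits = outputs[0], outputs[1]                       # (1, 2, seq_len, vocab), (1, 2)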
...@@ -217,6 +217,9 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -217,6 +217,9 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
scale=True, scale=True,
name='h_._{}'.format(i)) for i in range(config.n_layer)] name='h_._{}'.format(i)) for i in range(config.n_layer)]
def get_input_embeddings(self):
return self.tokens_embed
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError raise NotImplementedError
...@@ -226,26 +229,38 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -226,26 +229,38 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
""" """
raise NotImplementedError raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask head_mask = inputs[4] if len(inputs) > 4 else head_mask
assert len(inputs) <= 5, "Too many inputs." inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask) attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids) token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids) position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 5, "Too many inputs." inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if position_ids is None: if position_ids is None:
position_ids = tf.range(shape_list(input_ids)[-1], dtype=tf.int32)[tf.newaxis, :] position_ids = tf.range(input_shape[-1], dtype=tf.int32)[tf.newaxis, :]
if attention_mask is not None: if attention_mask is not None:
# We create a 3D attention mask from a 2D tensor mask. # We create a 3D attention mask from a 2D tensor mask.
...@@ -277,11 +292,10 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -277,11 +292,10 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
head_mask = [None] * self.num_hidden_layers head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers) # head_mask = tf.constant([0] * self.num_hidden_layers)
input_shape = shape_list(input_ids)
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
inputs_embeds = self.tokens_embed(input_ids, mode='embedding') if inputs_embeds is None:
inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
position_embeds = self.positions_embed(position_ids) position_embeds = self.positions_embed(position_ids)
if token_type_ids is not None: if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
...@@ -389,6 +403,10 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: ...@@ -389,6 +403,10 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
...@@ -458,6 +476,9 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): ...@@ -458,6 +476,9 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs) super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
def get_output_embeddings(self):
return self.transformer.tokens_embed
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
...@@ -520,36 +541,44 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): ...@@ -520,36 +541,44 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False): def get_output_embeddings(self):
return self.transformer.tokens_embed
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask head_mask = inputs[4] if len(inputs) > 4 else head_mask
mc_token_ids = inputs[5] if len(inputs) > 5 else mc_token_ids inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs." mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask) attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids) token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids) position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
assert len(inputs) <= 6, "Too many inputs." assert len(inputs) <= 7, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
input_shapes = shape_list(input_ids) if input_ids is not None:
input_shapes = shape_list(input_ids)
else:
input_shapes = shape_list(inputs_embeds)[:-1]
seq_length = input_shapes[-1] seq_length = input_shapes[-1]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
transformer_outputs = self.transformer(flat_inputs, training=training) transformer_outputs = self.transformer(flat_inputs, training=training)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
......
...@@ -48,13 +48,17 @@ class TFRobertaEmbeddings(TFBertEmbeddings): ...@@ -48,13 +48,17 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
def _embedding(self, inputs, training=False): def _embedding(self, inputs, training=False):
"""Applies embedding based on inputs tensor.""" """Applies embedding based on inputs tensor."""
input_ids, position_ids, token_type_ids = inputs input_ids, position_ids, token_type_ids, inputs_embeds = inputs
if input_ids is not None:
seq_length = tf.shape(input_ids)[1]
else:
seq_length = tf.shape(inputs_embeds)[1]
seq_length = tf.shape(input_ids)[1]
if position_ids is None: if position_ids is None:
position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :] position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :]
return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids], training=training) return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
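RoBERTa reserves positions 0..``padding_idx`` for padding, so real tokens start at ``padding_idx + 1`` (``padding_idx`` is 1 for the RoBERTa checkpoints). A one-liner check of the offset:

    import tensorflow as tf

    padding_idx, seq_length = 1, 5
    position_ids = tf.range(padding_idx + 1, seq_length + padding_idx + 1, dtype=tf.int32)[tf.newaxis, :]
    print(position_ids.numpy())   # [[2 3 4 5 6]]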
class TFRobertaMainLayer(TFBertMainLayer): class TFRobertaMainLayer(TFBertMainLayer):
...@@ -65,21 +69,8 @@ class TFRobertaMainLayer(TFBertMainLayer): ...@@ -65,21 +69,8 @@ class TFRobertaMainLayer(TFBertMainLayer):
super(TFRobertaMainLayer, self).__init__(config, **kwargs) super(TFRobertaMainLayer, self).__init__(config, **kwargs)
self.embeddings = TFRobertaEmbeddings(config, name='embeddings') self.embeddings = TFRobertaEmbeddings(config, name='embeddings')
def call(self, inputs, **kwargs): def get_input_embeddings(self):
# Check that input_ids starts with control token return self.embeddings
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
else:
input_ids = inputs
if tf.not_equal(tf.reduce_sum(input_ids[:, 0]), 0):
tf.print("A sequence with no special tokens has been passed to the RoBERTa model. "
"This model requires special tokens in order to work. "
"Please specify add_special_tokens=True in your encoding.")
return super(TFRobertaMainLayer, self).call(inputs, **kwargs)
class TFRobertaPreTrainedModel(TFPreTrainedModel): class TFRobertaPreTrainedModel(TFPreTrainedModel):
...@@ -173,6 +164,10 @@ ROBERTA_INPUTS_DOCSTRING = r""" ...@@ -173,6 +164,10 @@ ROBERTA_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
...@@ -292,6 +287,9 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): ...@@ -292,6 +287,9 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
self.roberta = TFRobertaMainLayer(config, name="roberta") self.roberta = TFRobertaMainLayer(config, name="roberta")
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
def get_output_embeddings(self):
return self.lm_head.decoder
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
outputs = self.roberta(inputs, **kwargs) outputs = self.roberta(inputs, **kwargs)
...@@ -371,3 +369,54 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): ...@@ -371,3 +369,54 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
outputs = (logits,) + outputs[2:] outputs = (logits,) + outputs[2:]
return outputs # logits, (hidden_states), (attentions) return outputs # logits, (hidden_states), (attentions)
@add_start_docstrings("""RoBERTa Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
Classification scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForTokenClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, name='roberta')
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
def call(self, inputs, **kwargs):
outputs = self.roberta(inputs, **kwargs)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # scores, (hidden_states), (attentions)
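A short continuation of the docstring example above, showing how the per-token scores are typically turned into label ids (on the plain ``roberta-base`` checkpoint the classification head is freshly initialized, so the labels are not meaningful until the head is fine-tuned):

    import tensorflow as tf
    from transformers import RobertaTokenizer, TFRobertaForTokenClassification

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = TFRobertaForTokenClassification.from_pretrained('roberta-base')

    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]
    scores = model(input_ids)[0]                       # (1, seq_len, num_labels)
    predicted_labels = tf.argmax(scores, axis=-1)      # (1, seq_len)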
...@@ -413,6 +413,9 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -413,6 +413,9 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
name='r_r_bias') name='r_r_bias')
super(TFTransfoXLMainLayer, self).build(input_shape) super(TFTransfoXLMainLayer, self).build(input_shape)
def get_input_embeddings(self):
return self.word_emb
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
return self.word_emb return self.word_emb
...@@ -427,11 +430,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -427,11 +430,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
def _prune_heads(self, heads): def _prune_heads(self, heads):
raise NotImplementedError raise NotImplementedError
def init_mems(self, data): def init_mems(self, bsz):
if self.mem_len > 0: if self.mem_len > 0:
mems = [] mems = []
for i in range(self.n_layer): for i in range(self.n_layer):
empty = tf.zeros([self.mem_len, shape_list(data)[1], self.d_model]) empty = tf.zeros([self.mem_len, bsz, self.d_model])
mems.append(empty) mems.append(empty)
return mems return mems
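``init_mems`` now only needs the batch size: it allocates one zero memory tensor per layer, shaped ``(mem_len, batch_size, d_model)``, to be extended as generation proceeds. A toy-sized sketch of what it builds:

    import tensorflow as tf

    mem_len, bsz, d_model, n_layer = 4, 2, 8, 3
    mems = [tf.zeros([mem_len, bsz, d_model]) for _ in range(n_layer)]
    print(len(mems), mems[0].shape)   # 3 (4, 2, 8)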
...@@ -461,28 +464,37 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -461,28 +464,37 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
return new_mems return new_mems
def call(self, inputs, mems=None, head_mask=None, training=False): def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
mems = inputs[1] if len(inputs) > 1 else mems mems = inputs[1] if len(inputs) > 1 else mems
head_mask = inputs[2] if len(inputs) > 2 else head_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask
assert len(inputs) <= 3, "Too many inputs." inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
assert len(inputs) <= 4, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
mems = inputs.get('mems', mems) mems = inputs.get('mems', mems)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 3, "Too many inputs." inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 4, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz] # so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = tf.transpose(input_ids, perm=(1, 0)) if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_ids = tf.transpose(input_ids, perm=(1, 0))
qlen, bsz = shape_list(input_ids)
elif inputs_embeds is not None:
inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
qlen, bsz = shape_list(inputs_embeds)[:2]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if mems is None: if mems is None:
mems = self.init_mems(input_ids) mems = self.init_mems(bsz)
qlen, bsz = shape_list(input_ids)
# Prepare head mask if needed # Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head # 1.0 in head_mask indicate we keep the head
...@@ -494,7 +506,10 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -494,7 +506,10 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
else: else:
head_mask = [None] * self.n_layer head_mask = [None] * self.n_layer
word_emb = self.word_emb(input_ids) if inputs_embeds is not None:
word_emb = inputs_embeds
else:
word_emb = self.word_emb(input_ids)
mlen = shape_list(mems[0])[0] if mems is not None else 0 mlen = shape_list(mems[0])[0] if mems is not None else 0
klen = mlen + qlen klen = mlen + qlen
...@@ -626,6 +641,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" ...@@ -626,6 +641,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
...@@ -716,28 +735,33 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): ...@@ -716,28 +735,33 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
def reset_length(self, tgt_len, ext_len, mem_len): def reset_length(self, tgt_len, ext_len, mem_len):
self.transformer.reset_length(tgt_len, ext_len, mem_len) self.transformer.reset_length(tgt_len, ext_len, mem_len)
def init_mems(self, data): def init_mems(self, bsz):
return self.transformer.init_mems(data) return self.transformer.init_mems(bsz)
def call(self, inputs, mems=None, head_mask=None, labels=None, training=False): def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, labels=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
mems = inputs[1] if len(inputs) > 1 else mems mems = inputs[1] if len(inputs) > 1 else mems
head_mask = inputs[2] if len(inputs) > 2 else head_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask
labels = inputs[3] if len(inputs) > 3 else labels inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
assert len(inputs) <= 4, "Too many inputs." labels = inputs[4] if len(inputs) > 4 else labels
assert len(inputs) <= 5, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
mems = inputs.get('mems', mems) mems = inputs.get('mems', mems)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
labels = inputs.get('labels', labels) labels = inputs.get('labels', labels)
assert len(inputs) <= 4, "Too many inputs." assert len(inputs) <= 5, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
bsz, tgt_len = shape_list(input_ids)[:2] if input_ids is not None:
bsz, tgt_len = shape_list(input_ids)[:2]
else:
bsz, tgt_len = shape_list(inputs_embeds)[:2]
transformer_outputs = self.transformer([input_ids, mems, head_mask], training=training) transformer_outputs = self.transformer([input_ids, mems, head_mask, inputs_embeds], training=training)
last_hidden = transformer_outputs[0] last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:] pred_hid = last_hidden[:, -tgt_len:]
......
...@@ -35,7 +35,7 @@ class TFPreTrainedModel(tf.keras.Model): ...@@ -35,7 +35,7 @@ class TFPreTrainedModel(tf.keras.Model):
r""" Base class for all TF models. r""" Base class for all TF models.
:class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
Class attributes (overridden by derived classes): Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
...@@ -65,6 +65,21 @@ class TFPreTrainedModel(tf.keras.Model): ...@@ -65,6 +65,21 @@ class TFPreTrainedModel(tf.keras.Model):
# Save config in model # Save config in model
self.config = config self.config = config
def get_input_embeddings(self):
""" Get model's input embeddings
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
return base_model.get_input_embeddings()
else:
raise NotImplementedError
def get_output_embeddings(self):
""" Get model's output embeddings
Return None if the model doesn't have output embeddings
"""
return None # Overwrite for models with output embeddings
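The two accessors give a uniform way to reach a model's embedding layers regardless of architecture. A small sketch with GPT-2, whose main layer and LM head both resolve to the shared ``wte`` matrix (per the additions elsewhere in this change):

    from transformers import TFGPT2LMHeadModel

    model = TFGPT2LMHeadModel.from_pretrained('gpt2')
    wte_in = model.get_input_embeddings()     # delegated via base_model_prefix -> transformer.wte
    wte_out = model.get_output_embeddings()   # also transformer.wte (tied embeddings)
    print(wte_in is wte_out)                  # True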
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
""" Build a resized Embedding Variable from a provided token Embedding Module. """ Build a resized Embedding Variable from a provided token Embedding Module.
Increasing the size will add newly initialized vectors at the end Increasing the size will add newly initialized vectors at the end
...@@ -483,10 +498,10 @@ def shape_list(x): ...@@ -483,10 +498,10 @@ def shape_list(x):
return [dynamic[i] if s is None else s for i, s in enumerate(static)] return [dynamic[i] if s is None else s for i, s in enumerate(static)]
def get_initializer(initializer_range=0.02): def get_initializer(initializer_range=0.02):
"""Creates a `tf.initializers.truncated_normal` with the given range. """Creates a `tf.initializers.truncated_normal` with the given range.
Args: Args:
initializer_range: float, initializer range for stddev. initializer_range: float, initializer range for stddev.
Returns: Returns:
TruncatedNormal initializer with stddev = `initializer_range`. TruncatedNormal initializer with stddev = `initializer_range`.
""" """
return tf.keras.initializers.TruncatedNormal(stddev=initializer_range) return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
...@@ -84,7 +84,8 @@ def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32): ...@@ -84,7 +84,8 @@ def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32):
attn_mask = mask attn_mask = mask
# sanity check # sanity check
assert shape_list(mask) == [bs, slen] # assert shape_list(mask) == [bs, slen]
tf.debugging.assert_equal(shape_list(mask), [bs, slen])
assert causal is False or shape_list(attn_mask) == [bs, slen, slen] assert causal is False or shape_list(attn_mask) == [bs, slen, slen]
mask = tf.cast(mask, dtype=dtype) mask = tf.cast(mask, dtype=dtype)
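Note (illustrative, not part of the diff): the Python asserts are being replaced because shape_list can return symbolic tensors under tf.function, which a plain `==` comparison does not actually check; tf.debugging.assert_equal verifies the values when the graph runs. A minimal eager-mode sketch, redefining shape_list locally so the snippet is self-contained:

import tensorflow as tf

def shape_list(x):
    # static dims where known, dynamic tf.shape() entries otherwise
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]

mask = tf.ones((2, 5), dtype=tf.int32)
bs, slen = 2, 5
tf.debugging.assert_equal(shape_list(mask), [bs, slen])        # passes silently
# tf.debugging.assert_equal(shape_list(mask), [bs, slen + 1])  # would raise InvalidArgumentError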
...@@ -276,6 +277,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -276,6 +277,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
self.prune_heads({int(layer): list(map(int, heads))}) self.prune_heads({int(layer): list(map(int, heads))})
def get_input_embeddings(self):
return self.embeddings
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError raise NotImplementedError
...@@ -287,7 +291,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -287,7 +291,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
raise NotImplementedError raise NotImplementedError
def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None, def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None,
position_ids=None, lengths=None, cache=None, head_mask=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None,
training=False): # removed: src_enc=None, src_len=None training=False): # removed: src_enc=None, src_len=None
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -298,7 +302,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -298,7 +302,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
lengths = inputs[5] if len(inputs) > 5 else lengths lengths = inputs[5] if len(inputs) > 5 else lengths
cache = inputs[6] if len(inputs) > 6 else cache cache = inputs[6] if len(inputs) > 6 else cache
head_mask = inputs[7] if len(inputs) > 7 else head_mask head_mask = inputs[7] if len(inputs) > 7 else head_mask
assert len(inputs) <= 8, "Too many inputs." inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask) attention_mask = inputs.get('attention_mask', attention_mask)
...@@ -308,17 +313,30 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -308,17 +313,30 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
lengths = inputs.get('lengths', lengths) lengths = inputs.get('lengths', lengths)
cache = inputs.get('cache', cache) cache = inputs.get('cache', cache)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 8, "Too many inputs." inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 9, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
bs, slen = shape_list(input_ids)
elif inputs_embeds is not None:
bs, slen = shape_list(inputs_embeds)[:2]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if lengths is None: if lengths is None:
lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) if input_ids is not None:
lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
else:
lengths = tf.convert_to_tensor([slen]*bs, tf.int32)
# mask = input_ids != self.pad_index # mask = input_ids != self.pad_index
# check inputs # check inputs
bs, slen = shape_list(input_ids) # assert shape_list(lengths)[0] == bs
assert shape_list(lengths)[0] == bs tf.debugging.assert_equal(shape_list(lengths)[0], bs)
# assert lengths.max().item() <= slen # assert lengths.max().item() <= slen
# input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
# assert (src_enc is None) == (src_len is None) # assert (src_enc is None) == (src_len is None)
...@@ -335,12 +353,14 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -335,12 +353,14 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
if position_ids is None: if position_ids is None:
position_ids = tf.expand_dims(tf.range(slen), axis=0) position_ids = tf.expand_dims(tf.range(slen), axis=0)
else: else:
assert shape_list(position_ids) == [bs, slen] # (slen, bs) # assert shape_list(position_ids) == [bs, slen] # (slen, bs)
tf.debugging.assert_equal(shape_list(position_ids), [bs, slen])
# position_ids = position_ids.transpose(0, 1) # position_ids = position_ids.transpose(0, 1)
# langs # langs
if langs is not None: if langs is not None:
assert shape_list(langs) == [bs, slen] # (slen, bs) # assert shape_list(langs) == [bs, slen] # (slen, bs)
tf.debugging.assert_equal(shape_list(langs), [bs, slen])
# langs = langs.transpose(0, 1) # langs = langs.transpose(0, 1)
# Prepare head mask if needed # Prepare head mask if needed
...@@ -354,7 +374,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -354,7 +374,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
head_mask = [None] * self.n_layers head_mask = [None] * self.n_layers
# do not recompute cached elements # do not recompute cached elements
if cache is not None: if cache is not None and input_ids is not None:
_slen = slen - cache['slen'] _slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:] input_ids = input_ids[:, -_slen:]
position_ids = position_ids[:, -_slen:] position_ids = position_ids[:, -_slen:]
...@@ -364,8 +384,10 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -364,8 +384,10 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
attn_mask = attn_mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:]
# embeddings # embeddings
tensor = self.embeddings(input_ids) if inputs_embeds is None:
tensor = tensor + self.position_embeddings(position_ids) inputs_embeds = self.embeddings(input_ids)
tensor = inputs_embeds + self.position_embeddings(position_ids)
if langs is not None and self.use_lang_emb: if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs) tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None: if token_type_ids is not None:
...@@ -526,6 +548,10 @@ XLM_INPUTS_DOCSTRING = r""" ...@@ -526,6 +548,10 @@ XLM_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
...@@ -633,6 +659,8 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): ...@@ -633,6 +659,8 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
self.transformer = TFXLMMainLayer(config, name='transformer') self.transformer = TFXLMMainLayer(config, name='transformer')
self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj') self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj')
def get_output_embeddings(self):
return self.pred_layer.input_embeddings
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
......
...@@ -371,6 +371,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): ...@@ -371,6 +371,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)] self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = tf.keras.layers.Dropout(config.dropout)
def get_input_embeddings(self):
return self.word_embedding
def build(self, input_shape): def build(self, input_shape):
initializer = get_initializer(self.initializer_range) initializer = get_initializer(self.initializer_range)
self.mask_emb = self.add_weight(shape=(1, 1, self.d_model), self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
...@@ -484,7 +487,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): ...@@ -484,7 +487,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
return pos_emb return pos_emb
def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, training=False): token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, training=False):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
...@@ -494,7 +497,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): ...@@ -494,7 +497,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids
input_mask = inputs[6] if len(inputs) > 6 else input_mask input_mask = inputs[6] if len(inputs) > 6 else input_mask
head_mask = inputs[7] if len(inputs) > 7 else head_mask head_mask = inputs[7] if len(inputs) > 7 else head_mask
assert len(inputs) <= 8, "Too many inputs." inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask) attention_mask = inputs.get('attention_mask', attention_mask)
...@@ -504,7 +508,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): ...@@ -504,7 +508,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
token_type_ids = inputs.get('token_type_ids', token_type_ids) token_type_ids = inputs.get('token_type_ids', token_type_ids)
input_mask = inputs.get('input_mask', input_mask) input_mask = inputs.get('input_mask', input_mask)
head_mask = inputs.get('head_mask', head_mask) head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 8, "Too many inputs." inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
assert len(inputs) <= 9, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
...@@ -512,14 +517,23 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): ...@@ -512,14 +517,23 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
# but we want a unified interface in the library with the batch size on the first dimension # but we want a unified interface in the library with the batch size on the first dimension
# so we move here the first dimension (batch) to the end # so we move here the first dimension (batch) to the end
input_ids = tf.transpose(input_ids, perm=(1, 0)) if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_ids = tf.transpose(input_ids, perm=(1, 0))
qlen, bsz = shape_list(input_ids)[:2]
elif inputs_embeds is not None:
inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
qlen, bsz = shape_list(inputs_embeds)[:2]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None
qlen, bsz = shape_list(input_ids)[:2]
mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0 mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0
klen = mlen + qlen klen = mlen + qlen
...@@ -570,7 +584,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): ...@@ -570,7 +584,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
non_tgt_mask = None non_tgt_mask = None
##### Word embeddings and prepare h & g hidden states ##### Word embeddings and prepare h & g hidden states
word_emb_k = self.word_embedding(input_ids) if inputs_embeds is not None:
word_emb_k = inputs_embeds
else:
word_emb_k = self.word_embedding(input_ids)
output_h = self.dropout(word_emb_k, training=training) output_h = self.dropout(word_emb_k, training=training)
if target_mapping is not None: if target_mapping is not None:
word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1]) word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
...@@ -762,6 +779,10 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -762,6 +779,10 @@ XLNET_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
...@@ -850,6 +871,9 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): ...@@ -850,6 +871,9 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
self.transformer = TFXLNetMainLayer(config, name='transformer') self.transformer = TFXLNetMainLayer(config, name='transformer')
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss') self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss')
def get_output_embeddings(self):
return self.lm_loss.input_embeddings
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
hidden_state = transformer_outputs[0] hidden_state = transformer_outputs[0]
......
...@@ -553,6 +553,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" ...@@ -553,6 +553,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", @add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
...@@ -639,9 +643,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -639,9 +643,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
self.init_weights() self.init_weights()
def _resize_token_embeddings(self, new_num_tokens): def get_input_embeddings(self):
return self.word_emb return self.word_emb
def set_input_embeddings(self, new_embeddings):
self.word_emb = new_embeddings
def backward_compatible(self): def backward_compatible(self):
self.sample_softmax = -1 self.sample_softmax = -1
...@@ -654,12 +661,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -654,12 +661,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
logger.info("Head pruning is not implemented for Transformer-XL model") logger.info("Head pruning is not implemented for Transformer-XL model")
pass pass
def init_mems(self, data): def init_mems(self, bsz):
if self.mem_len > 0: if self.mem_len > 0:
mems = [] mems = []
param = next(self.parameters()) param = next(self.parameters())
for i in range(self.n_layer): for i in range(self.n_layer):
empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model, empty = torch.zeros(self.mem_len, bsz, self.config.d_model,
dtype=param.dtype, device=param.device) dtype=param.dtype, device=param.device)
mems.append(empty) mems.append(empty)
...@@ -690,15 +697,22 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -690,15 +697,22 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return new_mems return new_mems
def forward(self, input_ids, mems=None, head_mask=None): def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None):
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz] # so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = input_ids.transpose(0, 1).contiguous() if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_ids = input_ids.transpose(0, 1).contiguous()
qlen, bsz = input_ids.size()
elif inputs_embeds is not None:
inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if mems is None: if mems is None:
mems = self.init_mems(input_ids) mems = self.init_mems(bsz)
qlen, bsz = input_ids.size()
# Prepare head mask if needed # Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head # 1.0 in head_mask indicate we keep the head
...@@ -715,7 +729,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -715,7 +729,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
else: else:
head_mask = [None] * self.n_layer head_mask = [None] * self.n_layer
word_emb = self.word_emb(input_ids) if inputs_embeds is not None:
word_emb = inputs_embeds
else:
word_emb = self.word_emb(input_ids)
mlen = mems[0].size(0) if mems is not None else 0 mlen = mems[0].size(0) if mems is not None else 0
klen = mlen + qlen klen = mlen + qlen
...@@ -826,7 +843,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -826,7 +843,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
config.cutoffs, div_val=config.div_val) config.cutoffs, div_val=config.div_val)
self.init_weights() self.init_weights()
self.tie_weights()
def tie_weights(self): def tie_weights(self):
""" """
...@@ -858,14 +874,18 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -858,14 +874,18 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
def reset_length(self, tgt_len, ext_len, mem_len): def reset_length(self, tgt_len, ext_len, mem_len):
self.transformer.reset_length(tgt_len, ext_len, mem_len) self.transformer.reset_length(tgt_len, ext_len, mem_len)
def init_mems(self, data): def init_mems(self, bsz):
return self.transformer.init_mems(data) return self.transformer.init_mems(bsz)
def forward(self, input_ids, mems=None, head_mask=None, labels=None): def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, labels=None):
bsz = input_ids.size(0) if input_ids is not None:
tgt_len = input_ids.size(1) bsz, tgt_len = input_ids.size(0), input_ids.size(1)
elif inputs_embeds is not None:
bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask) transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask, inputs_embeds=inputs_embeds)
last_hidden = transformer_outputs[0] last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:] pred_hid = last_hidden[:, -tgt_len:]
......
...@@ -53,7 +53,7 @@ class PreTrainedModel(nn.Module): ...@@ -53,7 +53,7 @@ class PreTrainedModel(nn.Module):
r""" Base class for all models. r""" Base class for all models.
:class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
Class attributes (overridden by derived classes): Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
...@@ -83,55 +83,59 @@ class PreTrainedModel(nn.Module): ...@@ -83,55 +83,59 @@ class PreTrainedModel(nn.Module):
# Save config in model # Save config in model
self.config = config self.config = config
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): @property
""" Build a resized Embedding Module from a provided token Embedding Module. def base_model(self):
Increasing the size will add newly initialized vectors at the end return getattr(self, self.base_model_prefix, self)
Reducing the size will remove vectors from the end
Args: def get_input_embeddings(self):
new_num_tokens: (`optional`) int """ Get model's input embeddings
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: return the provided token Embedding Module.
Return: ``torch.nn.Embeddings``
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
""" """
if new_num_tokens is None: base_model = getattr(self, self.base_model_prefix, self)
return old_embeddings if base_model is not self:
return base_model.get_input_embeddings()
old_num_tokens, old_embedding_dim = old_embeddings.weight.size() else:
if old_num_tokens == new_num_tokens: raise NotImplementedError
return old_embeddings
# Build new embeddings
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
new_embeddings.to(old_embeddings.weight.device)
# initialize all new embeddings (in particular added tokens) def set_input_embeddings(self, value):
self._init_weights(new_embeddings) """ Set model's input embeddings
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
base_model.set_input_embeddings(value)
else:
raise NotImplementedError
# Copy word embeddings from the previous weights def get_output_embeddings(self):
num_tokens_to_copy = min(old_num_tokens, new_num_tokens) """ Get model's output embeddings
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] Return None if the model doesn't have output embeddings
"""
return None # Override for models with output embeddings
return new_embeddings def tie_weights(self):
""" Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
"""
output_embeddings = self.get_output_embeddings()
if output_embeddings is not None:
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
def _tie_or_clone_weights(self, first_module, second_module): def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
""" Tie or clone module weights depending of weither we are using TorchScript or not """ Tie or clone module weights depending of weither we are using TorchScript or not
""" """
if self.config.torchscript: if self.config.torchscript:
first_module.weight = nn.Parameter(second_module.weight.clone()) output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
else: else:
first_module.weight = second_module.weight output_embeddings.weight = input_embeddings.weight
if hasattr(first_module, 'bias') and first_module.bias is not None: if hasattr(output_embeddings, 'bias') and output_embeddings.bias is not None:
first_module.bias.data = torch.nn.functional.pad( output_embeddings.bias.data = torch.nn.functional.pad(
first_module.bias.data, output_embeddings.bias.data,
(0, first_module.weight.shape[0] - first_module.bias.shape[0]), (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]),
'constant', 'constant',
0 0
) )
if hasattr(output_embeddings, 'out_features') and hasattr(input_embeddings, 'num_embeddings'):
output_embeddings.out_features = input_embeddings.num_embeddings
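Note (illustrative check, not part of the diff): with tie_weights now driven by get_output_embeddings()/get_input_embeddings() and called from init_weights(), an LM-head model shares a single weight matrix between its embedding and its output projection. The snippet assumes the `xlnet-base-cased` checkpoint can be downloaded.

from transformers import XLNetLMHeadModel

model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
input_emb = model.get_input_embeddings()    # nn.Embedding (the word embedding matrix)
output_emb = model.get_output_embeddings()  # nn.Linear used by the LM head (self.lm_loss)

# tied, not cloned, since config.torchscript defaults to False
assert output_emb.weight.data_ptr() == input_emb.weight.data_ptr()
# _tie_or_clone_weights also keeps out_features in sync with the vocabulary size
assert output_emb.out_features == input_emb.num_embeddings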
def resize_token_embeddings(self, new_num_tokens=None): def resize_token_embeddings(self, new_num_tokens=None):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
...@@ -161,6 +165,46 @@ class PreTrainedModel(nn.Module): ...@@ -161,6 +165,46 @@ class PreTrainedModel(nn.Module):
return model_embeds return model_embeds
def _resize_token_embeddings(self, new_num_tokens):
old_embeddings = self.get_input_embeddings()
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
self.set_input_embeddings(new_embeddings)
return self.get_input_embeddings()
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
""" Build a resized Embedding Module from a provided token Embedding Module.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
Args:
new_num_tokens: (`optional`) int
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: return the provided token Embedding Module.
Return: ``torch.nn.Embeddings``
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
"""
if new_num_tokens is None:
return old_embeddings
old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
if old_num_tokens == new_num_tokens:
return old_embeddings
# Build new embeddings
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
new_embeddings.to(old_embeddings.weight.device)
# initialize all new embeddings (in particular added tokens)
self._init_weights(new_embeddings)
# Copy word embeddings from the previous weights
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
return new_embeddings
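Note (illustrative usage, not part of the diff): the resize path now goes through get_input_embeddings()/set_input_embeddings(), so it works uniformly across models. The snippet assumes the `xlnet-base-cased` checkpoint and tokenizer can be downloaded.

from transformers import XLNetModel, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

tokenizer.add_tokens(['<new_token_1>', '<new_token_2>'])
# grows the embedding matrix; the two new rows are freshly initialised by _init_weights
model.resize_token_embeddings(len(tokenizer))
assert model.get_input_embeddings().num_embeddings == len(tokenizer)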
def init_weights(self): def init_weights(self):
""" Initialize and prunes weights if needed. """ """ Initialize and prunes weights if needed. """
# Initialize weights # Initialize weights
...@@ -170,6 +214,9 @@ class PreTrainedModel(nn.Module): ...@@ -170,6 +214,9 @@ class PreTrainedModel(nn.Module):
if self.config.pruned_heads: if self.config.pruned_heads:
self.prune_heads(self.config.pruned_heads) self.prune_heads(self.config.pruned_heads)
# Tie weights if needed
self.tie_weights()
def prune_heads(self, heads_to_prune): def prune_heads(self, heads_to_prune):
""" Prunes heads of the base model. """ Prunes heads of the base model.
...@@ -178,14 +225,12 @@ class PreTrainedModel(nn.Module): ...@@ -178,14 +225,12 @@ class PreTrainedModel(nn.Module):
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
""" """
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
base_model._prune_heads(heads_to_prune) self.base_model._prune_heads(heads_to_prune)
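Note (illustrative usage, not part of the diff): prune_heads keeps its dict-of-layer-indices interface but now reaches the encoder through the new base_model property. BertModel is used here purely as an example, and downloading `bert-base-uncased` is assumed.

from transformers import BertModel

model = BertModel.from_pretrained('bert-base-uncased')
assert model.base_model is model            # BertModel is already the base model
model.prune_heads({1: [0, 2], 2: [2, 3]})   # prune heads 0 and 2 of layer 1, heads 2 and 3 of layer 2
print(model.config.pruned_heads)            # e.g. {1: [0, 2], 2: [2, 3]}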
def save_pretrained(self, save_directory): def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it """ Save a model and its configuration file to a directory, so that it
...@@ -193,7 +238,7 @@ class PreTrainedModel(nn.Module): ...@@ -193,7 +238,7 @@ class PreTrainedModel(nn.Module):
""" """
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
# Only save the model it-self if we are using distributed training # Only save the model itself if we are using distributed training
model_to_save = self.module if hasattr(self, 'module') else self model_to_save = self.module if hasattr(self, 'module') else self
# Save configuration file # Save configuration file
...@@ -273,6 +318,10 @@ class PreTrainedModel(nn.Module): ...@@ -273,6 +318,10 @@ class PreTrainedModel(nn.Module):
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
""" """
if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
"https://github.com/google-research/google-research/issues/119 for more information.")
config = kwargs.pop('config', None) config = kwargs.pop('config', None)
state_dict = kwargs.pop('state_dict', None) state_dict = kwargs.pop('state_dict', None)
cache_dir = kwargs.pop('cache_dir', None) cache_dir = kwargs.pop('cache_dir', None)
...@@ -289,6 +338,7 @@ class PreTrainedModel(nn.Module): ...@@ -289,6 +338,7 @@ class PreTrainedModel(nn.Module):
cache_dir=cache_dir, return_unused_kwargs=True, cache_dir=cache_dir, return_unused_kwargs=True,
force_download=force_download, force_download=force_download,
resume_download=resume_download, resume_download=resume_download,
proxies=proxies,
**kwargs **kwargs
) )
else: else:
...@@ -389,6 +439,8 @@ class PreTrainedModel(nn.Module): ...@@ -389,6 +439,8 @@ class PreTrainedModel(nn.Module):
if metadata is not None: if metadata is not None:
state_dict._metadata = metadata state_dict._metadata = metadata
# PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
# so we need to apply the function recursively.
def load(module, prefix=''): def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict( module._load_from_state_dict(
......
...@@ -73,15 +73,15 @@ def get_masks(slen, lengths, causal, padding_mask=None): ...@@ -73,15 +73,15 @@ def get_masks(slen, lengths, causal, padding_mask=None):
""" """
Generate hidden states mask, and optionally an attention mask. Generate hidden states mask, and optionally an attention mask.
""" """
bs = lengths.size(0) alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
if padding_mask is not None: if padding_mask is not None:
mask = padding_mask mask = padding_mask
else: else:
assert lengths.max().item() <= slen assert lengths.max().item() <= slen
alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
mask = alen < lengths[:, None] mask = alen < lengths[:, None]
# attention mask is the same as mask, or triangular inferior attention (causal) # attention mask is the same as mask, or triangular inferior attention (causal)
bs = lengths.size(0)
if causal: if causal:
attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None] attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
else: else:
...@@ -311,6 +311,10 @@ XLM_INPUTS_DOCSTRING = r""" ...@@ -311,6 +311,10 @@ XLM_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", @add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
...@@ -407,10 +411,12 @@ class XLMModel(XLMPreTrainedModel): ...@@ -407,10 +411,12 @@ class XLMModel(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def _resize_token_embeddings(self, new_num_tokens): def get_input_embeddings(self):
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
return self.embeddings return self.embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """ Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
...@@ -419,14 +425,21 @@ class XLMModel(XLMPreTrainedModel): ...@@ -419,14 +425,21 @@ class XLMModel(XLMPreTrainedModel):
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads) self.attentions[layer].prune_heads(heads)
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None): # removed: src_enc=None, src_len=None lengths=None, cache=None, head_mask=None, inputs_embeds=None): # removed: src_enc=None, src_len=None
if input_ids is not None:
bs, slen = input_ids.size()
else:
bs, slen = inputs_embeds.size()[:-1]
if lengths is None: if lengths is None:
lengths = (input_ids != self.pad_index).sum(dim=1).long() if input_ids is not None:
lengths = (input_ids != self.pad_index).sum(dim=1).long()
else:
lengths = torch.tensor([slen] * bs, dtype=torch.long, device=inputs_embeds.device)
# mask = input_ids != self.pad_index # mask = input_ids != self.pad_index
# check inputs # check inputs
bs, slen = input_ids.size()
assert lengths.size(0) == bs assert lengths.size(0) == bs
assert lengths.max().item() <= slen assert lengths.max().item() <= slen
# input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
...@@ -440,10 +453,12 @@ class XLMModel(XLMPreTrainedModel): ...@@ -440,10 +453,12 @@ class XLMModel(XLMPreTrainedModel):
# if self.is_decoder and src_enc is not None: # if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
device = input_ids.device if input_ids is not None else inputs_embeds.device
# position_ids # position_ids
if position_ids is None: if position_ids is None:
position_ids = input_ids.new((slen,)).long() position_ids = torch.arange(slen, dtype=torch.long, device=device)
position_ids = torch.arange(slen, out=position_ids).unsqueeze(0) position_ids = position_ids.unsqueeze(0).expand((bs, slen))
else: else:
assert position_ids.size() == (bs, slen) # (slen, bs) assert position_ids.size() == (bs, slen) # (slen, bs)
# position_ids = position_ids.transpose(0, 1) # position_ids = position_ids.transpose(0, 1)
...@@ -469,7 +484,7 @@ class XLMModel(XLMPreTrainedModel): ...@@ -469,7 +484,7 @@ class XLMModel(XLMPreTrainedModel):
head_mask = [None] * self.n_layers head_mask = [None] * self.n_layers
# do not recompute cached elements # do not recompute cached elements
if cache is not None: if cache is not None and input_ids is not None:
_slen = slen - cache['slen'] _slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:] input_ids = input_ids[:, -_slen:]
position_ids = position_ids[:, -_slen:] position_ids = position_ids[:, -_slen:]
...@@ -479,8 +494,10 @@ class XLMModel(XLMPreTrainedModel): ...@@ -479,8 +494,10 @@ class XLMModel(XLMPreTrainedModel):
attn_mask = attn_mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:]
# embeddings # embeddings
tensor = self.embeddings(input_ids) if inputs_embeds is None:
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor) inputs_embeds = self.embeddings(input_ids)
tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
if langs is not None and self.use_lang_emb: if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs) tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None: if token_type_ids is not None:
...@@ -618,15 +635,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -618,15 +635,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
self.pred_layer = XLMPredLayer(config) self.pred_layer = XLMPredLayer(config)
self.init_weights() self.init_weights()
self.tie_weights()
def tie_weights(self): def get_output_embeddings(self):
""" Make sure we are sharing the embeddings return self.pred_layer.proj
"""
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -634,7 +648,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -634,7 +648,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
outputs = self.pred_layer(output, labels) outputs = self.pred_layer(output, labels)
...@@ -686,8 +701,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -686,8 +701,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -695,7 +710,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -695,7 +710,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
...@@ -769,8 +785,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -769,8 +785,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -778,7 +794,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -778,7 +794,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = transformer_outputs[0] sequence_output = transformer_outputs[0]
...@@ -864,8 +881,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -864,8 +881,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None,
is_impossible=None, cls_index=None, p_mask=None): is_impossible=None, cls_index=None, p_mask=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -874,7 +891,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -874,7 +891,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
......
...@@ -558,6 +558,10 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -558,6 +558,10 @@ XLNET_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
""" """
@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.", @add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
...@@ -611,10 +615,12 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -611,10 +615,12 @@ class XLNetModel(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def _resize_token_embeddings(self, new_num_tokens): def get_input_embeddings(self):
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
return self.word_embedding return self.word_embedding
def set_input_embeddings(self, new_embeddings):
self.word_embedding = new_embeddings
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
raise NotImplementedError raise NotImplementedError
...@@ -710,19 +716,29 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -710,19 +716,29 @@ class XLNetModel(XLNetPreTrainedModel):
pos_emb = pos_emb.to(next(self.parameters())) pos_emb = pos_emb.to(next(self.parameters()))
return pos_emb return pos_emb
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None): token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None):
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension # but we want a unified interface in the library with the batch size on the first dimension
# so we move here the first dimension (batch) to the end # so we move here the first dimension (batch) to the end
input_ids = input_ids.transpose(0, 1).contiguous() if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_ids = input_ids.transpose(0, 1).contiguous()
qlen, bsz = input_ids.shape[0], input_ids.shape[1]
elif inputs_embeds is not None:
inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
qlen, bsz = input_ids.shape[0], input_ids.shape[1]
mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0 mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
klen = mlen + qlen klen = mlen + qlen
...@@ -775,7 +791,10 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -775,7 +791,10 @@ class XLNetModel(XLNetPreTrainedModel):
non_tgt_mask = None non_tgt_mask = None
##### Word embeddings and prepare h & g hidden states ##### Word embeddings and prepare h & g hidden states
word_emb_k = self.word_embedding(input_ids) if inputs_embeds is not None:
word_emb_k = inputs_embeds
else:
word_emb_k = self.word_embedding(input_ids)
output_h = self.dropout(word_emb_k) output_h = self.dropout(word_emb_k)
if target_mapping is not None: if target_mapping is not None:
word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1) word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
...@@ -918,15 +937,12 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -918,15 +937,12 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True) self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
self.init_weights() self.init_weights()
self.tie_weights()
def tie_weights(self): def get_output_embeddings(self):
""" Make sure we are sharing the embeddings return self.lm_loss
"""
self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, labels=None): token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
mems=mems, mems=mems,
...@@ -934,7 +950,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -934,7 +950,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
target_mapping=target_mapping, target_mapping=target_mapping,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
input_mask=input_mask, input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
logits = self.lm_loss(transformer_outputs[0]) logits = self.lm_loss(transformer_outputs[0])
...@@ -999,8 +1016,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -999,8 +1016,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, labels=None): token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
mems=mems, mems=mems,
...@@ -1008,7 +1025,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1008,7 +1025,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
target_mapping=target_mapping, target_mapping=target_mapping,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
input_mask=input_mask, input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
output = self.sequence_summary(output) output = self.sequence_summary(output)
...@@ -1050,6 +1068,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1050,6 +1068,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
...@@ -1094,9 +1116,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1094,9 +1116,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None, mems=None, perm_mask=None, target_mapping=None,
labels=None, head_mask=None): labels=None, head_mask=None, inputs_embeds=None):
num_choices = input_ids.shape[1] num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_input_ids = input_ids.view(-1, input_ids.size(-1))
...@@ -1107,7 +1129,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1107,7 +1129,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids, transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids,
input_mask=flat_input_mask, attention_mask=flat_attention_mask, input_mask=flat_input_mask, attention_mask=flat_attention_mask,
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
head_mask=head_mask) head_mask=head_mask, inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -1179,8 +1201,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1179,8 +1201,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None): start_positions=None, end_positions=None):
outputs = self.transformer(input_ids, outputs = self.transformer(input_ids,
...@@ -1190,7 +1212,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1190,7 +1212,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
target_mapping=target_mapping, target_mapping=target_mapping,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
input_mask=input_mask, input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1295,8 +1318,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1295,8 +1318,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,): start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -1305,7 +1328,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1305,7 +1328,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
target_mapping=target_mapping, target_mapping=target_mapping,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
input_mask=input_mask, input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask) start_logits = self.start_logits(hidden_states, p_mask=p_mask)
......
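Before the optimization changes below, the inputs_embeds path threaded through the XLNet heads above can be exercised directly. A minimal sketch, assuming the xlnet-base-cased checkpoint and the model's word_embedding lookup table (neither is part of this diff):

import torch
from transformers import XLNetTokenizer, XLNetModel

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')
model.eval()  # disable dropout so both calls below match

input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
# Look the embeddings up ourselves instead of passing token ids
inputs_embeds = model.word_embedding(input_ids)

with torch.no_grad():
    hidden_from_ids = model(input_ids)[0]
    hidden_from_embeds = model(inputs_embeds=inputs_embeds)[0]
# The two hidden-state tensors should be identical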
...@@ -23,86 +23,66 @@ from torch.optim.lr_scheduler import LambdaLR ...@@ -23,86 +23,66 @@ from torch.optim.lr_scheduler import LambdaLR
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
Removed (the old class-based schedules):

class ConstantLRSchedule(LambdaLR):
    """ Constant learning rate schedule.
    """
    def __init__(self, optimizer, last_epoch=-1):
        super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)


class WarmupConstantSchedule(LambdaLR):
    """ Linear warmup and then constant.
        Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps.
        Keeps learning rate schedule equal to 1. after warmup_steps.
    """
    def __init__(self, optimizer, warmup_steps, last_epoch=-1):
        self.warmup_steps = warmup_steps
        super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1.0, self.warmup_steps))
        return 1.


class WarmupLinearSchedule(LambdaLR):
    """ Linear warmup and then linear decay.
        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
        Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps.
    """
    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))


class WarmupCosineSchedule(LambdaLR):
    """ Linear warmup and then cosine decay.
        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
        Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
        If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
    """
    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.cycles = cycles
        super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1.0, self.warmup_steps))
        # progress after warmup
        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
        return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))


class WarmupCosineWithHardRestartsSchedule(LambdaLR):
    """ Linear warmup and then cosine cycles with hard restarts.
        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
        If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
        learning rate (with hard restarts).
    """
    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.cycles = cycles
        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        # progress after warmup
        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
        if progress >= 1.0:
            return 0.0
        return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0))))

Added (the equivalent function-based schedules):

def get_constant_schedule(optimizer, last_epoch=-1):
    """ Create a schedule with a constant learning rate.
    """
    return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)


def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
    """ Create a schedule with a constant learning rate preceded by a warmup
    period during which the learning rate increases linearly between 0 and 1.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1.0, num_warmup_steps))
        return 1.

    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)


def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """ Create a schedule with a learning rate that decreases linearly after
    linearly increasing during a warmup period.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)


def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
    """ Create a schedule with a learning rate that decreases following the
    values of the cosine function between 0 and `pi * cycles` after a warmup
    period during which it increases linearly between 0 and 1.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)


def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1):
    """ Create a schedule with a learning rate that decreases following the
    values of the cosine function with several hard restarts, after a warmup
    period during which it increases linearly between 0 and 1.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        if progress >= 1.:
            return 0.
        return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.))))

    return LambdaLR(optimizer, lr_lambda, last_epoch)
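The function-based schedules above plug straight into a standard training loop together with AdamW. A usage sketch with placeholder numbers; model and dataloader stand in for whatever you are training, and the helpers are assumed to be importable from the package root (otherwise use transformers.optimization):

from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)

for batch in dataloader:
    loss = model(**batch)[0]
    loss.backward()
    optimizer.step()
    scheduler.step()       # advance the learning-rate schedule once per optimizer step
    optimizer.zero_grad()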
class AdamW(Optimizer): class AdamW(Optimizer):
""" Implements Adam algorithm with weight decay fix. """ Implements Adam algorithm with weight decay fix.
......
...@@ -7,6 +7,13 @@ def pytest_addoption(parser): ...@@ -7,6 +7,13 @@ def pytest_addoption(parser):
parser.addoption( parser.addoption(
"--runslow", action="store_true", default=False, help="run slow tests" "--runslow", action="store_true", default=False, help="run slow tests"
) )
parser.addoption(
"--use_cuda", action="store_true", default=False, help="run tests on gpu"
)
def pytest_configure(config):
config.addinivalue_line("markers", "slow: mark test as slow to run")
def pytest_collection_modifyitems(config, items): def pytest_collection_modifyitems(config, items):
...@@ -17,3 +24,8 @@ def pytest_collection_modifyitems(config, items): ...@@ -17,3 +24,8 @@ def pytest_collection_modifyitems(config, items):
for item in items: for item in items:
if "slow" in item.keywords: if "slow" in item.keywords:
item.add_marker(skip_slow) item.add_marker(skip_slow)
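With the slow marker registered in pytest_configure above, a test opts in as below and is skipped unless pytest is invoked with --runslow; the test body here is a made-up placeholder:

import pytest

@pytest.mark.slow
def test_something_expensive():
    # Collected always, executed only with `pytest --runslow`
    assert True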
@pytest.fixture
def use_cuda(request):
""" Run test on gpu """
return request.config.getoption("--use_cuda")
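Tests consume the new flag simply by accepting the fixture as an argument; a hedged sketch of a hypothetical test module:

import torch

def test_forward_on_requested_device(use_cuda):
    # use_cuda is True only when the suite is run with `pytest --use_cuda`
    device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
    x = torch.ones(2, 3, device=device)
    assert x.sum().item() == 6.0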
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
if is_torch_available():
from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
AlbertForSequenceClassification, AlbertForQuestionAnswering,
)
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
else:
pytestmark = pytest.mark.skip("Require Torch")
class AlbertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
class AlbertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
embedding_size=16,
hidden_size=36,
num_hidden_layers=6,
num_hidden_groups=6,
num_attention_heads=6,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
self.num_hidden_groups = num_hidden_groups
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = AlbertConfig(
vocab_size_or_config_json_file=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
num_hidden_groups=self.num_hidden_groups)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertModel(config=config)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids)
result = {
"sequence_output": sequence_output,
"pooled_output": pooled_output,
}
self.parent.assertListEqual(
list(result["sequence_output"].size()),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertForMaskedLM(config=config)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
result = {
"loss": loss,
"prediction_scores": prediction_scores,
}
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertForQuestionAnswering(config=config)
model.eval()
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
start_positions=sequence_labels, end_positions=sequence_labels)
result = {
"loss": loss,
"start_logits": start_logits,
"end_logits": end_logits,
}
self.parent.assertListEqual(
list(result["start_logits"].size()),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].size()),
[self.batch_size, self.seq_length])
self.check_loss_output(result)
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = AlbertForSequenceClassification(config)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
result = {
"loss": loss,
"logits": logits,
}
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.num_labels])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = AlbertModelTest.AlbertModelTester(self)
self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_albert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
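Outside the test harness, the tiny-configuration pattern used by AlbertModelTester can double as a quick local smoke test. A sketch reusing the made-up hyperparameters from the tester above (not a released checkpoint), assuming AlbertConfig accepts the same keyword arguments it is given in prepare_config_and_inputs:

import torch
from transformers import AlbertConfig, AlbertModel

config = AlbertConfig(vocab_size_or_config_json_file=99, embedding_size=16, hidden_size=36,
                      num_hidden_layers=6, num_hidden_groups=6, num_attention_heads=6,
                      intermediate_size=37, max_position_embeddings=512, type_vocab_size=16)
model = AlbertModel(config)
model.eval()

input_ids = torch.randint(0, 99, (13, 7))  # (batch_size, seq_length)
with torch.no_grad():
    sequence_output, pooled_output = model(input_ids)

print(sequence_output.shape)  # expected: torch.Size([13, 7, 36])
print(pooled_output.shape)    # expected: torch.Size([13, 36])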