Commit 72863735 authored by thomwolf

WIP GPT2

parent 34f28b2a
...@@ -704,20 +704,7 @@ class TFBertModel(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForPreTraining(TFBertPreTrainedModel): class TFBertForPreTraining(TFBertPreTrainedModel):
r""" r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the masked language modeling loss.
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
**next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)``
...@@ -762,15 +749,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForMaskedLM(TFBertPreTrainedModel): class TFBertForMaskedLM(TFBertPreTrainedModel):
r""" r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the masked language modeling loss.
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...@@ -786,8 +765,8 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased') model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids) outputs = model(input_ids)
loss, prediction_scores = outputs[:2] prediction_scores = outputs[:2]
""" """
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
...@@ -811,12 +790,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForNextSentencePrediction(TFBertPreTrainedModel): class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
r""" r"""
**next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Next sequence prediction (classification) loss. Next sequence prediction (classification) loss.
...@@ -862,15 +835,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForSequenceClassification(TFBertPreTrainedModel): class TFBertForSequenceClassification(TFBertPreTrainedModel):
r""" r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...@@ -886,8 +851,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids)
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2] loss, logits = outputs[:2]
""" """
...@@ -905,7 +869,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output) if training:
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
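Dropout in the TF heads is now gated on the Keras ``training`` flag rather than applied unconditionally. A standalone sketch of that pattern (a toy head, not the library's class):

import tensorflow as tf

class ToyClassifierHead(tf.keras.layers.Layer):
    """Toy head mirroring the pooled-output -> dropout -> dense pattern above."""
    def __init__(self, num_labels, dropout_prob=0.1, **kwargs):
        super(ToyClassifierHead, self).__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_labels)

    def call(self, pooled_output, training=False):
        if training:                      # dropout only fires during training, as in the head above
            pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

logits = ToyClassifierHead(num_labels=2)(tf.random.normal((8, 768)), training=True)   # (8, 2)

Keras ``Dropout`` also accepts the flag directly (``self.dropout(x, training=training)``), which is the equivalent idiomatic form.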
...@@ -915,53 +880,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForMultipleChoice(TFBertPreTrainedModel): class TFBertForMultipleChoice(TFBertPreTrainedModel):
r""" r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs:
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
(b) For single sequences:
``tokens: [CLS] the dog is hairy . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0``
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above). of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
...@@ -979,8 +901,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
model = BertForMultipleChoice.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids)
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2] loss, classification_scores = outputs[:2]
""" """
...@@ -1025,7 +946,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output) if training:
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
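The multiple-choice head works by folding the choices into the batch dimension before the encoder and unfolding the logits afterwards. A minimal shape walk-through with made-up sizes:

import tensorflow as tf

batch_size, num_choices, seq_len = 2, 4, 7                        # made-up sizes
input_ids = tf.zeros((batch_size, num_choices, seq_len), dtype=tf.int32)

flat_input_ids = tf.reshape(input_ids, (-1, seq_len))             # (8, 7): each choice becomes its own batch row
per_choice_logits = tf.zeros((batch_size * num_choices, 1))       # stand-in for classifier(pooled_output)
reshaped_logits = tf.reshape(per_choice_logits, (-1, num_choices))  # (2, 4): softmax over the last axis picks a choice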
...@@ -1039,13 +961,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForTokenClassification(TFBertPreTrainedModel): class TFBertForTokenClassification(TFBertPreTrainedModel):
r""" r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
Classification scores (before SoftMax). Classification scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...@@ -1061,8 +977,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids)
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2] loss, scores = outputs[:2]
""" """
...@@ -1080,7 +995,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
sequence_output = outputs[0] sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output) if training:
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
...@@ -1093,18 +1009,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForQuestionAnswering(TFBertPreTrainedModel): class TFBertForQuestionAnswering(TFBertPreTrainedModel):
r""" r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
...
...@@ -28,13 +28,13 @@ from io import open
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from .modeling_tf_utils import TFPreTrainedModel, TFConv1D from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list
from .configuration_gpt2 import GPT2Config from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5", TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"} "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"}
...@@ -139,7 +139,7 @@ class TFAttention(tf.keras.layers.Layer):
@staticmethod @staticmethod
@tf.function @tf.function
def attention_mask(nd, ns, dtype): def causal_attention_mask(nd, ns, dtype):
"""1's in the lower triangle, counting from the lower right corner. """1's in the lower triangle, counting from the lower right corner.
Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
""" """
...@@ -150,20 +150,24 @@ class TFAttention(tf.keras.layers.Layer):
@tf.function @tf.function
def _attn(self, inputs, training=False): def _attn(self, inputs, training=False):
q, k, v, head_mask = inputs q, k, v, attention_mask, head_mask = inputs
# q, k, v have shape [batch, heads, sequence, features] # q, k, v have shape [batch, heads, sequence, features]
w = tf.matmul(q, k, transpose_b=True) w = tf.matmul(q, k, transpose_b=True)
if self.scale: if self.scale:
n_state = shape_list(v)[-1] dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype)) w = w / tf.math.sqrt(dk)
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
_, _, nd, ns = shape_list(w) _, _, nd, ns = shape_list(w)
b = self.attention_mask(nd, ns, dtype=w.dtype) b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
b = tf.reshape(b, [1, 1, nd, ns]) b = tf.reshape(b, [1, 1, nd, ns])
w = w * b - 1e4 * (1 - b) w = w * b - 1e4 * (1 - b)
w = tf.nn.softmax(w) if attention_mask is not None:
# Apply the attention mask
w = w + attention_mask
w = tf.nn.softmax(w, axis=-1)
if training: if training:
w = self.attn_dropout(w) w = self.attn_dropout(w)
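A self-contained sketch of the masking logic in ``_attn`` above: scaling by sqrt(d_k), the causal band computed the same way as ``causal_attention_mask``, and an optional additive padding mask (names and shapes here are illustrative):

import tensorflow as tf

def toy_attn(q, k, v, attention_mask=None):
    # q, k, v: (batch, heads, seq, head_features); attention_mask: additive, 0.0 keep / -10000.0 pad
    w = tf.matmul(q, k, transpose_b=True)
    w = w / tf.math.sqrt(tf.cast(tf.shape(k)[-1], tf.float32))        # scale by sqrt(d_k)
    nd, ns = tf.shape(w)[-2], tf.shape(w)[-1]
    i = tf.range(nd)[:, None]
    j = tf.range(ns)
    b = tf.cast(i >= j - ns + nd, w.dtype)                            # lower-triangular band, counted from the lower right
    w = w * b - 1e4 * (1 - b)                                         # block attention to future positions
    if attention_mask is not None:
        w = w + attention_mask                                        # additive padding mask
    w = tf.nn.softmax(w, axis=-1)
    return tf.matmul(w, v)

q = k = v = tf.random.normal((1, 2, 5, 8))
out = toy_attn(q, k, v)                                               # (1, 2, 5, 8)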
...@@ -179,20 +183,20 @@ class TFAttention(tf.keras.layers.Layer):
@tf.function @tf.function
def merge_heads(self, x): def merge_heads(self, x):
x = tf.transpose(x, [0, 2, 1, 3]) x = tf.transpose(x, [0, 2, 1, 3])
x_shape = tf.shape(x) x_shape = shape_list(x)
new_x_shape = x_shape[:-2] + (x_shape[-2] * x_shape[-1],) new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
return tf.reshape(x, new_x_shape) return tf.reshape(x, new_x_shape)
@tf.function @tf.function
def split_heads(self, x): def split_heads(self, x):
x_shape = tf.shape(x) x_shape = shape_list(x)
new_x_shape = x_shape[:-1] + (self.n_head, x_shape[-1] // self.n_head) new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
x = tf.reshape(x, new_x_shape) x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features)
@tf.function @tf.function
def call(self, inputs, training=False): def call(self, inputs, training=False):
x, layer_past, head_mask = inputs x, layer_past, attention_mask, head_mask = inputs
x = self.c_attn(x) x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2) query, key, value = tf.split(x, 3, axis=2)
...@@ -205,7 +209,7 @@ class TFAttention(tf.keras.layers.Layer):
value = tf.concat([past_value, value], axis=-2) value = tf.concat([past_value, value], axis=-2)
present = tf.stack([key, value], axis=1) present = tf.stack([key, value], axis=1)
attn_outputs = self._attn(query, key, value, head_mask, training=training) attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
a = attn_outputs[0] a = attn_outputs[0]
a = self.merge_heads(a) a = self.merge_heads(a)
...@@ -217,7 +221,7 @@ class TFAttention(tf.keras.layers.Layer):
return outputs # a, present, (attentions) return outputs # a, present, (attentions)
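``split_heads``/``merge_heads`` above now build their target shapes as Python lists via ``shape_list`` from ``modeling_tf_utils``. A sketch of what such a helper typically looks like (static dims where known, dynamic tensors otherwise); this is an assumption about the helper, not its verbatim definition:

import tensorflow as tf

def shape_list(x):
    """Shape as a Python list: static dims where known, dynamic tensors otherwise."""
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]

# list-valued shapes make the head split easy to express, as in TFAttention.split_heads above:
n_head = 3
x = tf.random.normal((2, 5, 12))                                  # (batch, seq, n_embd)
x_shape = shape_list(x)
x = tf.reshape(x, x_shape[:-1] + [n_head, x_shape[-1] // n_head])
x = tf.transpose(x, (0, 2, 1, 3))                                 # (batch, head, seq, head_features)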
class TFMLP(nn.Module): class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs): def __init__(self, n_state, config, **kwargs):
super(TFMLP, self).__init__(**kwargs) super(TFMLP, self).__init__(**kwargs)
nx = config.n_embd nx = config.n_embd
...@@ -245,15 +249,16 @@ class TFBlock(tf.keras.layers.Layer):
self.mlp = TFMLP(4 * nx, config, name='mlp') self.mlp = TFMLP(4 * nx, config, name='mlp')
@tf.function @tf.function
def call(self, x, layer_past=None, head_mask=None, training=False): def call(self, inputs, training=False):
output_attn = self.attn(self.ln_1(x), x, layer_past, attention_mask, head_mask = inputs
layer_past=layer_past,
head_mask=head_mask,
training=training)
a = output_attn[0] # output_attn: a, present, (attentions)
a = self.ln_1(x)
output_attn = self.attn([a, layer_past, attention_mask, head_mask], training=training)
a = output_attn[0] # output_attn: a, present, (attentions)
x = x + a x = x + a
m = self.mlp(self.ln_2(x), training=training)
m = self.ln_2(x)
m = self.mlp(m, training=training)
x = x + m x = x + m
outputs = [x] + output_attn[1:] outputs = [x] + output_attn[1:]
...@@ -274,13 +279,13 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
""" """
self.weight = self.add_weight( self.weight = self.add_weight(
"weight", "weight",
shape=[self.vocab_size, self.n_embed], shape=[self.vocab_size, self.hidden_size],
initializer=tf.random_normal_initializer( initializer=tf.random_normal_initializer(
mean=0., stddev=self.n_embed**-0.5)) mean=0., stddev=self.hidden_size**-0.5))
super(TFBertEmbeddings, self).build(input_shape) super(TFGPT2Embeddings, self).build(input_shape)
@tf.function @tf.function
def call(self, inputs, mode="embedding", training=False): def call(self, inputs, mode="embedding"):
"""Get token embeddings of inputs. """Get token embeddings of inputs.
Args: Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
...@@ -296,7 +301,7 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
""" """
if mode == "embedding": if mode == "embedding":
return self._embedding(inputs, training=training) return self._embedding(inputs)
elif mode == "linear": elif mode == "linear":
return self._linear(inputs) return self._linear(inputs)
else: else:
...@@ -313,10 +318,10 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. float32 tensor with shape [batch_size, length, vocab_size].
""" """
batch_size = tf.shape(inputs)[0] batch_size = shape_list(inputs)[0]
length = tf.shape(inputs)[1] length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.n_embed]) x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.weight, transpose_b=True) logits = tf.matmul(x, self.weight, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) return tf.reshape(logits, [batch_size, length, self.vocab_size])
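``TFGPT2Embeddings`` doubles as the output projection ("linear" mode), so the LM head shares its weights with the input embeddings. A toy illustration of the two modes with a made-up weight matrix:

import tensorflow as tf

vocab_size, hidden_size = 10, 4
weight = tf.random.normal((vocab_size, hidden_size))              # one shared matrix for both directions

# "embedding" mode: ids -> hidden vectors
input_ids = tf.constant([[1, 2, 3]])
embeds = tf.gather(weight, input_ids)                             # (1, 3, hidden_size)

# "linear" mode: hidden vectors -> vocabulary logits, reusing the same matrix
hidden = tf.random.normal((1, 3, hidden_size))
flat = tf.reshape(hidden, [-1, hidden_size])
logits = tf.reshape(tf.matmul(flat, weight, transpose_b=True), [1, 3, vocab_size])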
...@@ -326,13 +331,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs) super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.n_embd = config.n_embd self.n_embd = config.n_embd
self.wte = TFGPT2Embeddings(config, name='wte') self.wte = TFGPT2Embeddings(config, name='wte')
self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe') self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx, config, scale=Truename='h_{}'.format(i)) for i in range(config.n_layer)] self.h = [TFBlock(config.n_ctx, config, scale=True, name='h_{}'.format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f') self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
...@@ -346,20 +352,20 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
@tf.function @tf.function
def call(self, inputs, training=False): def call(self, inputs, training=False):
input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
if not isinstance(inputs, (dict, tuple, list)): if not isinstance(inputs, (dict, tuple, list)):
input_ids = inputs input_ids = inputs
attention_mask, head_mask, position_ids, token_type_ids = None, None, None, None past, attention_mask, token_type_ids, position_ids, head_mask = None, None, None, None, None
elif isinstance(inputs, (tuple, list)): elif isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else None past = inputs[1] if len(inputs) > 1 else None
token_type_ids = inputs[2] if len(inputs) > 2 else None attention_mask = inputs[2] if len(inputs) > 2 else None
position_ids = inputs[3] if len(inputs) > 3 else None token_type_ids = inputs[3] if len(inputs) > 3 else None
head_mask = inputs[4] if len(inputs) > 4 else None position_ids = inputs[4] if len(inputs) > 4 else None
assert len(inputs) <= 5, "Too many inputs." head_mask = inputs[5] if len(inputs) > 5 else None
assert len(inputs) <= 6, "Too many inputs."
else: else:
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
past = inputs.get('past', None)
attention_mask = inputs.get('attention_mask', None) attention_mask = inputs.get('attention_mask', None)
token_type_ids = inputs.get('token_type_ids', None) token_type_ids = inputs.get('token_type_ids', None)
position_ids = inputs.get('position_ids', None) position_ids = inputs.get('position_ids', None)
...@@ -370,49 +376,66 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
past_length = 0 past_length = 0
past = [None] * len(self.h) past = [None] * len(self.h)
else: else:
past_length = past[0][0].size(-2) past_length = shape_list(past[0][0])[-2]
if position_ids is None: if position_ids is None:
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if attention_mask is not None:
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_mask = tf.cast(attention_mask, tf.float32)
attention_mask = (1.0 - attention_mask) * -10000.0
else:
attention_mask = None
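A quick numeric check of the additive-mask conversion above, assuming a padded batch where 1 marks real tokens:

import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0, 0]])                   # 1 = real token, 0 = padding
mask = tf.cast(attention_mask[:, tf.newaxis, tf.newaxis, :], tf.float32)
mask = (1.0 - mask) * -10000.0                                    # -> [[[[0., 0., 0., -10000., -10000.]]]]
# shape (1, 1, 1, 5) broadcasts over (batch, heads, query_len, key_len); adding it to the raw
# attention scores before the softmax effectively removes the padded keys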
# Prepare head mask if needed # Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head # 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N # attention_probs has shape bsz x n_heads x N x N
# head_mask has shape n_layer x batch x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
if head_mask is not None: # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask.dim() == 1: if not head_mask is None:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) raise NotImplementedError
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
else: else:
head_mask = [None] * self.config.n_layer head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
input_shape = input_ids.size() input_shape = shape_list(input_ids)
input_ids = input_ids.view(-1, input_ids.size(-1)) input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
position_ids = position_ids.view(-1, position_ids.size(-1)) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
inputs_embeds = self.wte(input_ids) inputs_embeds = self.wte(input_ids, mode='embedding')
position_embeds = self.wpe(position_ids) position_embeds = self.wpe(position_ids)
if token_type_ids is not None: if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.wte(token_type_ids) token_type_embeds = self.wte(token_type_ids, mode='embedding')
else: else:
token_type_embeds = 0 token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = inputs_embeds + position_embeds + token_type_embeds
hidden_states = self.drop(hidden_states) if training:
hidden_states = self.drop(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),) output_shape = input_shape + [shape_list(hidden_states)[-1]]
presents = () presents = ()
all_attentions = [] all_attentions = []
all_hidden_states = () all_hidden_states = ()
for i, (block, layer_past) in enumerate(zip(self.h, past)): for i, (block, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states: if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
outputs = block([hidden_states, layer_past, attention_mask, head_mask[i]], training=training)
outputs = block(hidden_states, layer_past, head_mask[i])
hidden_states, present = outputs[:2] hidden_states, present = outputs[:2]
presents = presents + (present,) presents = presents + (present,)
...@@ -421,7 +444,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
hidden_states = self.ln_f(hidden_states) hidden_states = self.ln_f(hidden_states)
hidden_states = hidden_states.view(*output_shape) hidden_states = tf.reshape(hidden_states, output_shape)
# Add last hidden state # Add last hidden state
if self.output_hidden_states: if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
...@@ -431,18 +454,19 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if self.output_attentions: if self.output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning # let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:] attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions) all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
return outputs # last hidden state, presents, (all hidden_states), (attentions) return outputs # last hidden state, presents, (all hidden_states), (attentions)
class TFGPT2PreTrainedModel(TFPreTrainedModel): class TFGPT2PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """ An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models. a simple interface for dowloading and loading pretrained models.
""" """
config_class = GPT2Config config_class = GPT2Config
pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_gpt2 load_pt_weights = load_gpt2_pt_weights_in_tf
base_model_prefix = "transformer" base_model_prefix = "transformer"
...@@ -487,17 +511,21 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**past**: **past**:
list of ``torch.FloatTensor`` (one for each layer): list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding. (see `past` output below). Can be used to speed up sequential decoding.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
...@@ -526,7 +554,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
Examples:: Examples::
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2') model = GPT2Model.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids) outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
...@@ -534,149 +562,19 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
""" """
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super(TFGPT2Model, self).__init__(config, *inputs, **kwargs) super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states self.transformer = TFGPT2MainLayer(config, name='transformer')
self.output_attentions = config.output_attentions
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx, config, scale=Truename='h_{}'.format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
self.init_weights()
def build(self, input_shape):
"""Build shared word embedding layer
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
with tf.name_scope("wte"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.wte = self.add_weight(
"weight",
shape=[self.vocab_size, self.n_embed],
initializer=tf.random_normal_initializer(
mean=0., stddev=self.n_embed**-0.5))
super(TFGPT2Model, self).build(input_shape)
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
@tf.function @tf.function
def call(self, inputs, training=False): def call(self, inputs, training=False):
input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None): outputs = self.transformer(inputs, training=training)
return outputs
if not isinstance(inputs, (dict, tuple, list)):
input_ids = inputs
attention_mask, head_mask, position_ids, token_type_ids = None, None, None, None
elif isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else None
token_type_ids = inputs[2] if len(inputs) > 2 else None
position_ids = inputs[3] if len(inputs) > 3 else None
head_mask = inputs[4] if len(inputs) > 4 else None
assert len(inputs) <= 5, "Too many inputs."
else:
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', None)
token_type_ids = inputs.get('token_type_ids', None)
position_ids = inputs.get('position_ids', None)
head_mask = inputs.get('head_mask', None)
assert len(inputs) <= 5, "Too many inputs."
if past is None:
past_length = 0
past = [None] * len(self.h)
else:
past_length = past[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# head_mask has shape n_layer x batch x n_heads x N x N
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
else:
head_mask = [None] * self.config.n_layer
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
inputs_embeds = self.wte(input_ids)
position_embeds = self.wpe(position_ids)
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
token_type_embeds = self.wte(token_type_ids)
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
hidden_states = self.drop(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),)
presents = ()
all_attentions = []
all_hidden_states = ()
for i, (block, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = block(hidden_states, layer_past, head_mask[i])
hidden_states, present = outputs[:2]
presents = presents + (present,)
if self.output_attentions:
all_attentions.append(outputs[2])
hidden_states = self.ln_f(hidden_states)
hidden_states = hidden_states.view(*output_shape)
# Add last hidden state
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states, presents)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,)
return outputs # last hidden state, presents, (all hidden_states), (attentions)
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2LMHeadModel(GPT2PreTrainedModel): class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
r""" r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-1`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**past**: **past**:
...@@ -700,93 +598,38 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
model = GPT2LMHeadModel.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids) outputs = model(input_ids)
loss, logits = outputs[:2] logits = outputs[:2]
""" """
def __init__(self, config): def __init__(self, config, *inputs, **kwargs):
super(GPT2LMHeadModel, self).__init__(config) super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = GPT2Model(config) self.transformer = TFGPT2MainLayer(config, name='transformer')
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.init_weights()
self.tie_weights()
def tie_weights(self):
""" Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
"""
self._tie_or_clone_weights(self.lm_head,
self.transformer.wte)
def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None): @tf.function
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, def call(self, inputs, training=False):
past=past, head_mask=head_mask) transformer_outputs = self.transformer(inputs, training=training)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states) lm_logits = self.transformer.wte(hidden_states, mode="linear")
outputs = (lm_logits,) + transformer_outputs[1:] outputs = (lm_logits,) + transformer_outputs[1:]
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) return outputs # lm_logits, presents, (all hidden_states), (attentions)
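Since the TF head returns logits only (the label/loss branch of the PyTorch version is dropped), a greedy-generation sketch on top of it, with the import path assumed for this WIP branch:

import tensorflow as tf
from pytorch_transformers import GPT2Tokenizer, TFGPT2LMHeadModel  # import path assumed for this WIP branch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

input_ids = tf.constant([tokenizer.encode("Hello, my dog is")])
for _ in range(5):                                                 # greedy decoding, re-running the full prefix each step
    lm_logits = model(input_ids)[0]                                # (1, seq_len, vocab_size)
    next_id = tf.argmax(lm_logits[:, -1, :], axis=-1, output_type=tf.int32)
    input_ids = tf.concat([input_ids, next_id[:, None]], axis=-1)
print(tokenizer.decode(input_ids[0].numpy().tolist()))
# the second output (``presents``) can be fed back as ``past`` to avoid recomputing cached keys/values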
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification @add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings, The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the input of a specified classification token index in the input sequence). the classification head takes as input the input of a specified classification token index in the input sequence).
""", GPT2_START_DOCSTRING) """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel): class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
r""" Inputs: r"""
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**past**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-1`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Multiple choice classification loss.
**lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)`` **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
...@@ -827,43 +670,52 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2] lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')

    @tf.function
    def call(self, inputs, training=False):
        if not isinstance(inputs, (dict, tuple, list)):
            raise ValueError("Inputs should be a list or a dict with at least two elements: 'input_ids' and 'mc_token_ids'")
        elif isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mc_token_ids = inputs[1]
            past = inputs[2] if len(inputs) > 2 else None
            attention_mask = inputs[3] if len(inputs) > 3 else None
            token_type_ids = inputs[4] if len(inputs) > 4 else None
            position_ids = inputs[5] if len(inputs) > 5 else None
            head_mask = inputs[6] if len(inputs) > 6 else None
            assert len(inputs) <= 7, "Too many inputs."
        else:
            input_ids = inputs.get('input_ids')
            mc_token_ids = inputs.get('mc_token_ids')
            past = inputs.get('past', None)
            attention_mask = inputs.get('attention_mask', None)
            token_type_ids = inputs.get('token_type_ids', None)
            position_ids = inputs.get('position_ids', None)
            head_mask = inputs.get('head_mask', None)
            assert len(inputs) <= 7, "Too many inputs."

        # Flatten the (batch_size, num_choices, seq_length) inputs so the transformer
        # sees an ordinary (batch_size * num_choices, seq_length) batch.
        num_choices = shape_list(input_ids)[1]
        seq_length = shape_list(input_ids)[2]

        flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None

        flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
        transformer_outputs = self.transformer(flat_inputs, training=training)

        hidden_states = transformer_outputs[0]
        # Restore the choice dimension so the heads see (batch_size, num_choices, seq_length, hidden_size),
        # matching the output shapes documented above.
        hidden_states = tf.reshape(hidden_states, [-1, num_choices, seq_length, shape_list(hidden_states)[-1]])

        lm_logits = self.transformer.wte(hidden_states, mode="linear")
        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)

        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]

        return outputs  # lm logits, mc logits, presents, (all hidden_states), (attentions)
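A minimal usage sketch of the intended input shapes (not part of the commit; the TF double-heads test below is still stubbed out, so treat this as illustrative of the intended interface rather than verified behavior, and the config values are arbitrary — the import path follows the test file further down)::

    import tensorflow as tf
    from pytorch_transformers import GPT2Config
    from pytorch_transformers.modeling_tf_gpt2 import TFGPT2DoubleHeadsModel

    vocab_size = 99
    config = GPT2Config(vocab_size_or_config_json_file=vocab_size, n_embd=32, n_layer=2, n_head=4,
                        n_positions=64, n_ctx=64)
    model = TFGPT2DoubleHeadsModel(config)

    batch_size, num_choices, seq_length = 2, 3, 7
    # One tokenized sequence per choice.
    input_ids = tf.random.uniform((batch_size, num_choices, seq_length),
                                  maxval=vocab_size, dtype=tf.int32)
    # Position of the classification token inside each choice (here: the last token).
    mc_token_ids = tf.fill((batch_size, num_choices), seq_length - 1)

    lm_logits, mc_logits = model([input_ids, mc_token_ids])[:2]
    print(lm_logits.shape)  # language modeling scores for every position of every choice
    print(mc_logits.shape)  # one score (vector) per choice from the multiple choice head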
@@ -273,15 +273,117 @@ class TFConv1D(tf.keras.layers.Layer):
            mean=0., stddev=0.02))
        self.bias = self.add_weight(
            "bias",
            shape=[1, self.nf],
            initializer=tf.zeros_initializer())

    @tf.function
    def call(self, x):
        bz, sl = shape_list(x)[:2]

        x = tf.reshape(x, [-1, self.nx])
        x = tf.matmul(x, self.weight) + self.bias

        x = tf.reshape(x, [bz, sl, self.nf])

        return x
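TFConv1D is GPT-2's position-wise linear projection: the call above flattens the batch and sequence dimensions, applies a single matmul plus bias, and restores the leading shape. The same computation with plain TensorFlow ops, as a small sketch with illustrative sizes (``w`` and ``b`` stand in for ``self.weight`` and ``self.bias``)::

    import tensorflow as tf

    nx, nf = 8, 16                      # input / output feature sizes (illustrative)
    x = tf.random.normal((2, 5, nx))    # (batch, seq_len, nx)
    w = tf.random.normal((nx, nf))      # plays the role of self.weight
    b = tf.zeros((1, nf))               # plays the role of self.bias

    flat = tf.reshape(x, [-1, nx])      # (batch * seq_len, nx)
    out = tf.matmul(flat, w) + b        # (batch * seq_len, nf)
    out = tf.reshape(out, [2, 5, nf])   # back to (batch, seq_len, nf)
    print(out.shape)                    # (2, 5, 16)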
class TFSequenceSummary(tf.keras.layers.Layer):
r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
Args of the config class:
summary_type:
- 'last' => [default] take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj: Add a projection after the vector extraction
summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_activation: 'tanh' => add a tanh activation to the output, anything else => no activation (default)
summary_first_dropout: Add a dropout before the projection and activation
summary_last_dropout: Add a dropout after the projection and activation
"""
    def __init__(self, config, **kwargs):
        super(TFSequenceSummary, self).__init__(**kwargs)

        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
            raise NotImplementedError

        # Identity passthrough used when no projection is configured.
        self.summary = tf.keras.layers.Lambda(lambda x: x, name='summary')
        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = tf.keras.layers.Dense(num_classes, name='summary')

        self.activation = None
        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
            self.activation = tf.keras.layers.Activation('tanh')

        self.first_dropout = None
        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
            self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)

        self.last_dropout = None
        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
            self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
    @tf.function
    def call(self, inputs, training=False):
        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token
        """
        if not isinstance(inputs, (dict, tuple, list)):
            hidden_states = inputs
            cls_index = None
        elif isinstance(inputs, (tuple, list)):
            hidden_states = inputs[0]
            cls_index = inputs[1] if len(inputs) > 1 else None
            assert len(inputs) <= 2, "Too many inputs."
        else:
            hidden_states = inputs.get('hidden_states')
            cls_index = inputs.get('cls_index', None)

        if self.summary_type == 'last':
            output = hidden_states[:, -1]
        elif self.summary_type == 'first':
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
            output = tf.reduce_mean(hidden_states, axis=1)
        elif self.summary_type == 'cls_index':
            if cls_index is None:
                # default to the last token of each sequence as the classification token
                output = hidden_states[..., -1, :]
            else:
                # cls_index: shape (bsz,) or (bsz, ...) matching the leading dims of hidden_states;
                # gather the hidden state at that position along the sequence axis
                batch_dims = len(shape_list(cls_index))
                output = tf.gather(hidden_states, cls_index, axis=batch_dims, batch_dims=batch_dims)
            # output shape: (bsz, ..., hidden_size)
        elif self.summary_type == 'attn':
            raise NotImplementedError

        if self.first_dropout is not None:
            output = self.first_dropout(output, training=training)

        output = self.summary(output)

        if self.activation is not None:
            output = self.activation(output)

        if self.last_dropout is not None:
            output = self.last_dropout(output, training=training)

        return output
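A minimal sketch of using the layer in 'cls_index' mode. The config object here is an illustrative stand-in carrying only the attributes read in ``__init__`` above, not the library's real config class::

    import tensorflow as tf
    from types import SimpleNamespace

    config = SimpleNamespace(summary_type='cls_index',
                             summary_use_proj=True,
                             summary_proj_to_labels=False,
                             num_labels=0,
                             hidden_size=16,
                             summary_activation='tanh',
                             summary_first_dropout=0.0,
                             summary_last_dropout=0.1)

    summary = TFSequenceSummary(config, name='multiple_choice_head')

    hidden_states = tf.random.normal((2, 7, 16))     # (bsz, seq_len, hidden_size)
    cls_index = tf.constant([6, 3], dtype=tf.int32)  # classification token position per example
    output = summary([hidden_states, cls_index])     # one summary vector per example
    print(output.shape)                              # (2, 16)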
def shape_list(x):
"""Deal with dynamic shape in tensorflow cleanly."""
static = x.shape.as_list()
dynamic = tf.shape(x)
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
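A small sketch (not from the commit) of why mixing static and dynamic dimensions matters: inside a ``tf.function`` whose batch dimension is unknown, ``shape_list`` returns a scalar Tensor for that axis and plain Python ints for the rest::

    import tensorflow as tf

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, 7, 32], dtype=tf.float32)])
    def flatten_features(x):
        bz, sl, hd = shape_list(x)   # bz is a scalar int32 Tensor, sl and hd are the ints 7 and 32
        return tf.reshape(x, [bz, sl * hd])

    y = flatten_features(tf.zeros((2, 7, 32)))
    print(y.shape)  # (2, 224)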
@@ -44,6 +44,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                 seq_length=7,
                 is_training=True,
                 use_token_type_ids=True,
                 use_input_mask=True,
                 use_labels=True,
                 vocab_size=99,
                 hidden_size=32,
@@ -66,6 +67,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_token_type_ids = use_token_type_ids
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
@@ -86,6 +88,10 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
@@ -115,14 +121,14 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels

    def check_loss_output(self, result):
        self.parent.assertListEqual(
            list(result["loss"].size()),
            [])

    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2Model(config=config)
        model.eval()
@@ -139,7 +145,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
            [self.batch_size, self.seq_length, self.hidden_size])
        self.parent.assertEqual(len(result["presents"]), config.n_layer)

    def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2LMHeadModel(config)
        model.eval()
@@ -157,7 +163,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
            list(result["lm_logits"].size()),
            [self.batch_size, self.seq_length, self.vocab_size])

    def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2DoubleHeadsModel(config)
        model.eval()
@@ -177,7 +183,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs

        inputs_dict = {
            'input_ids': input_ids,
            'token_type_ids': token_type_ids,
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from pytorch_transformers import GPT2Config, is_tf_available
try:
import tensorflow as tf
from pytorch_transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
TFGPT2DoubleHeadsModel,
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
except ImportError:
pytestmark = pytest.mark.skip("Require TensorFlow")
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
TFGPT2DoubleHeadsModel) if is_tf_available() else ()
class TFGPT2ModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_token_type_ids=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_token_type_ids = use_token_type_ids
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = GPT2Config(
vocab_size_or_config_json_file=self.vocab_size,
n_embd=self.hidden_size,
n_layer=self.num_hidden_layers,
n_head=self.num_attention_heads,
# intermediate_size=self.intermediate_size,
# hidden_act=self.hidden_act,
# hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings,
n_ctx=self.max_position_embeddings
# type_vocab_size=self.type_vocab_size,
# initializer_range=self.initializer_range
)
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
return config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFGPT2Model(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output = model(inputs)[0]
inputs = [input_ids, None, input_mask] # None is the input for 'past'
sequence_output = model(inputs)[0]
sequence_output = model(input_ids)[0]
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFGPT2LMHeadModel(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
prediction_scores = model(inputs)[0]
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
pass
# model = TFGPT2DoubleHeadsModel(config=config)
# inputs = {'input_ids': input_ids,
# 'attention_mask': input_mask,
# 'token_type_ids': token_type_ids}
# seq_relationship_score, = model(inputs)[0]
# result = {
# "seq_relationship_score": seq_relationship_score.numpy(),
# }
# self.parent.assertListEqual(
# list(result["seq_relationship_score"].shape),
# [self.batch_size, 2])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, head_mask, token_type_ids,
sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFGPT2ModelTest.TFGPT2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=GPT2Config, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_gpt2_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
def test_gpt2_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)
def test_gpt2_double_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()