Commit 72863735 authored by thomwolf

WIP GPT2

parent 34f28b2a
...@@ -704,20 +704,7 @@ class TFBertModel(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForPreTraining(TFBertPreTrainedModel): class TFBertForPreTraining(TFBertPreTrainedModel):
r""" r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the masked language modeling loss.
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
**next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)``
...@@ -762,15 +749,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForMaskedLM(TFBertPreTrainedModel): class TFBertForMaskedLM(TFBertPreTrainedModel):
r""" r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the masked language modeling loss.
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...@@ -786,8 +765,8 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased') model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids) outputs = model(input_ids)
loss, prediction_scores = outputs[:2] prediction_scores = outputs[:2]
""" """
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
...@@ -811,12 +790,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForNextSentencePrediction(TFBertPreTrainedModel): class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
r""" r"""
**next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Next sequence prediction (classification) loss. Next sequence prediction (classification) loss.
...@@ -862,15 +835,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForSequenceClassification(TFBertPreTrainedModel): class TFBertForSequenceClassification(TFBertPreTrainedModel):
r""" r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...@@ -886,8 +851,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids)
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2] loss, logits = outputs[:2]
""" """
...@@ -905,7 +869,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output) if training:
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
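Dropout in the TF heads is now gated on the Keras ``training`` flag rather than applied unconditionally. A standalone sketch of that pattern (a toy head, not the library's class):

import tensorflow as tf

class ToyClassifierHead(tf.keras.layers.Layer):
    """Toy head mirroring the pooled-output -> dropout -> dense pattern above."""
    def __init__(self, num_labels, dropout_prob=0.1, **kwargs):
        super(ToyClassifierHead, self).__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_labels)

    def call(self, pooled_output, training=False):
        if training:                      # dropout only fires during training, as in the head above
            pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

logits = ToyClassifierHead(num_labels=2)(tf.random.normal((8, 768)), training=True)   # (8, 2)

Keras ``Dropout`` also accepts the flag directly (``self.dropout(x, training=training)``), which is the equivalent idiomatic form.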
...@@ -915,53 +880,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForMultipleChoice(TFBertPreTrainedModel): class TFBertForMultipleChoice(TFBertPreTrainedModel):
r""" r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs:
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
(b) For single sequences:
``tokens: [CLS] the dog is hairy . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0``
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above). of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
...@@ -979,8 +901,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
model = BertForMultipleChoice.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids)
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2] loss, classification_scores = outputs[:2]
""" """
...@@ -1025,7 +946,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output) if training:
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
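The multiple-choice head works by folding the choices into the batch dimension before the encoder and unfolding the logits afterwards. A minimal shape walk-through with made-up sizes:

import tensorflow as tf

batch_size, num_choices, seq_len = 2, 4, 7                        # made-up sizes
input_ids = tf.zeros((batch_size, num_choices, seq_len), dtype=tf.int32)

flat_input_ids = tf.reshape(input_ids, (-1, seq_len))             # (8, 7): each choice becomes its own batch row
per_choice_logits = tf.zeros((batch_size * num_choices, 1))       # stand-in for classifier(pooled_output)
reshaped_logits = tf.reshape(per_choice_logits, (-1, num_choices))  # (2, 4): softmax over the last axis picks a choice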
...@@ -1039,13 +961,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForTokenClassification(TFBertPreTrainedModel): class TFBertForTokenClassification(TFBertPreTrainedModel):
r""" r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
Classification scores (before SoftMax). Classification scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...@@ -1061,8 +977,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids)
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2] loss, scores = outputs[:2]
""" """
...@@ -1080,7 +995,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
sequence_output = outputs[0] sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output) if training:
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
...@@ -1093,18 +1009,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class TFBertForQuestionAnswering(TFBertPreTrainedModel): class TFBertForQuestionAnswering(TFBertPreTrainedModel):
r""" r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
...
...@@ -28,13 +28,13 @@ from io import open
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from .modeling_tf_utils import TFPreTrainedModel, TFConv1D from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list
from .configuration_gpt2 import GPT2Config from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5", TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"} "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"}
...@@ -139,7 +139,7 @@ class TFAttention(tf.keras.layers.Layer):
@staticmethod @staticmethod
@tf.function @tf.function
def attention_mask(nd, ns, dtype): def causal_attention_mask(nd, ns, dtype):
"""1's in the lower triangle, counting from the lower right corner. """1's in the lower triangle, counting from the lower right corner.
Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
""" """
...@@ -150,20 +150,24 @@ class TFAttention(tf.keras.layers.Layer):
@tf.function @tf.function
def _attn(self, inputs, training=False): def _attn(self, inputs, training=False):
q, k, v, head_mask = inputs q, k, v, attention_mask, head_mask = inputs
# q, k, v have shape [batch, heads, sequence, features] # q, k, v have shape [batch, heads, sequence, features]
w = tf.matmul(q, k, transpose_b=True) w = tf.matmul(q, k, transpose_b=True)
if self.scale: if self.scale:
n_state = shape_list(v)[-1] dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype)) w = w / tf.math.sqrt(dk)
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
_, _, nd, ns = shape_list(w) _, _, nd, ns = shape_list(w)
b = self.attention_mask(nd, ns, dtype=w.dtype) b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
b = tf.reshape(b, [1, 1, nd, ns]) b = tf.reshape(b, [1, 1, nd, ns])
w = w * b - 1e4 * (1 - b) w = w * b - 1e4 * (1 - b)
w = tf.nn.softmax(w) if attention_mask is not None:
# Apply the attention mask
w = w + attention_mask
w = tf.nn.softmax(w, axis=-1)
if training: if training:
w = self.attn_dropout(w) w = self.attn_dropout(w)
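A self-contained sketch of the masking logic in ``_attn`` above: scaling by sqrt(d_k), the causal band computed the same way as ``causal_attention_mask``, and an optional additive padding mask (names and shapes here are illustrative):

import tensorflow as tf

def toy_attn(q, k, v, attention_mask=None):
    # q, k, v: (batch, heads, seq, head_features); attention_mask: additive, 0.0 keep / -10000.0 pad
    w = tf.matmul(q, k, transpose_b=True)
    w = w / tf.math.sqrt(tf.cast(tf.shape(k)[-1], tf.float32))        # scale by sqrt(d_k)
    nd, ns = tf.shape(w)[-2], tf.shape(w)[-1]
    i = tf.range(nd)[:, None]
    j = tf.range(ns)
    b = tf.cast(i >= j - ns + nd, w.dtype)                            # lower-triangular band, counted from the lower right
    w = w * b - 1e4 * (1 - b)                                         # block attention to future positions
    if attention_mask is not None:
        w = w + attention_mask                                        # additive padding mask
    w = tf.nn.softmax(w, axis=-1)
    return tf.matmul(w, v)

q = k = v = tf.random.normal((1, 2, 5, 8))
out = toy_attn(q, k, v)                                               # (1, 2, 5, 8)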
...@@ -179,20 +183,20 @@ class TFAttention(tf.keras.layers.Layer):
@tf.function @tf.function
def merge_heads(self, x): def merge_heads(self, x):
x = tf.transpose(x, [0, 2, 1, 3]) x = tf.transpose(x, [0, 2, 1, 3])
x_shape = tf.shape(x) x_shape = shape_list(x)
new_x_shape = x_shape[:-2] + (x_shape[-2] * x_shape[-1],) new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
return tf.reshape(x, new_x_shape) return tf.reshape(x, new_x_shape)
@tf.function @tf.function
def split_heads(self, x): def split_heads(self, x):
x_shape = tf.shape(x) x_shape = shape_list(x)
new_x_shape = x_shape[:-1] + (self.n_head, x_shape[-1] // self.n_head) new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
x = tf.reshape(x, new_x_shape) x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features)
@tf.function @tf.function
def call(self, inputs, training=False): def call(self, inputs, training=False):
x, layer_past, head_mask = inputs x, layer_past, attention_mask, head_mask = inputs
x = self.c_attn(x) x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2) query, key, value = tf.split(x, 3, axis=2)
...@@ -205,7 +209,7 @@ class TFAttention(tf.keras.layers.Layer):
value = tf.concat([past_value, value], axis=-2) value = tf.concat([past_value, value], axis=-2)
present = tf.stack([key, value], axis=1) present = tf.stack([key, value], axis=1)
attn_outputs = self._attn(query, key, value, head_mask, training=training) attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
a = attn_outputs[0] a = attn_outputs[0]
a = self.merge_heads(a) a = self.merge_heads(a)
...@@ -217,7 +221,7 @@ class TFAttention(tf.keras.layers.Layer):
return outputs # a, present, (attentions) return outputs # a, present, (attentions)
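``split_heads``/``merge_heads`` above now build their target shapes as Python lists via ``shape_list`` from ``modeling_tf_utils``. A sketch of what such a helper typically looks like (static dims where known, dynamic tensors otherwise); this is an assumption about the helper, not its verbatim definition:

import tensorflow as tf

def shape_list(x):
    """Shape as a Python list: static dims where known, dynamic tensors otherwise."""
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]

# list-valued shapes make the head split easy to express, as in TFAttention.split_heads above:
n_head = 3
x = tf.random.normal((2, 5, 12))                                  # (batch, seq, n_embd)
x_shape = shape_list(x)
x = tf.reshape(x, x_shape[:-1] + [n_head, x_shape[-1] // n_head])
x = tf.transpose(x, (0, 2, 1, 3))                                 # (batch, head, seq, head_features)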
class TFMLP(nn.Module): class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs): def __init__(self, n_state, config, **kwargs):
super(TFMLP, self).__init__(**kwargs) super(TFMLP, self).__init__(**kwargs)
nx = config.n_embd nx = config.n_embd
...@@ -245,15 +249,16 @@ class TFBlock(tf.keras.layers.Layer):
self.mlp = TFMLP(4 * nx, config, name='mlp') self.mlp = TFMLP(4 * nx, config, name='mlp')
@tf.function @tf.function
def call(self, x, layer_past=None, head_mask=None, training=False): def call(self, inputs, training=False):
output_attn = self.attn(self.ln_1(x), x, layer_past, attention_mask, head_mask = inputs
layer_past=layer_past,
head_mask=head_mask,
training=training)
a = output_attn[0] # output_attn: a, present, (attentions)
a = self.ln_1(x)
output_attn = self.attn([a, layer_past, attention_mask, head_mask], training=training)
a = output_attn[0] # output_attn: a, present, (attentions)
x = x + a x = x + a
m = self.mlp(self.ln_2(x), training=training)
m = self.ln_2(x)
m = self.mlp(m, training=training)
x = x + m x = x + m
outputs = [x] + output_attn[1:] outputs = [x] + output_attn[1:]
...@@ -274,13 +279,13 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
""" """
self.weight = self.add_weight( self.weight = self.add_weight(
"weight", "weight",
shape=[self.vocab_size, self.n_embed], shape=[self.vocab_size, self.hidden_size],
initializer=tf.random_normal_initializer( initializer=tf.random_normal_initializer(
mean=0., stddev=self.n_embed**-0.5)) mean=0., stddev=self.hidden_size**-0.5))
super(TFBertEmbeddings, self).build(input_shape) super(TFGPT2Embeddings, self).build(input_shape)
@tf.function @tf.function
def call(self, inputs, mode="embedding", training=False): def call(self, inputs, mode="embedding"):
"""Get token embeddings of inputs. """Get token embeddings of inputs.
Args: Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
...@@ -296,7 +301,7 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
""" """
if mode == "embedding": if mode == "embedding":
return self._embedding(inputs, training=training) return self._embedding(inputs)
elif mode == "linear": elif mode == "linear":
return self._linear(inputs) return self._linear(inputs)
else: else:
...@@ -313,10 +318,10 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. float32 tensor with shape [batch_size, length, vocab_size].
""" """
batch_size = tf.shape(inputs)[0] batch_size = shape_list(inputs)[0]
length = tf.shape(inputs)[1] length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.n_embed]) x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.weight, transpose_b=True) logits = tf.matmul(x, self.weight, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size]) return tf.reshape(logits, [batch_size, length, self.vocab_size])
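``TFGPT2Embeddings`` doubles as the output projection ("linear" mode), so the LM head shares its weights with the input embeddings. A toy illustration of the two modes with a made-up weight matrix:

import tensorflow as tf

vocab_size, hidden_size = 10, 4
weight = tf.random.normal((vocab_size, hidden_size))              # one shared matrix for both directions

# "embedding" mode: ids -> hidden vectors
input_ids = tf.constant([[1, 2, 3]])
embeds = tf.gather(weight, input_ids)                             # (1, 3, hidden_size)

# "linear" mode: hidden vectors -> vocabulary logits, reusing the same matrix
hidden = tf.random.normal((1, 3, hidden_size))
flat = tf.reshape(hidden, [-1, hidden_size])
logits = tf.reshape(tf.matmul(flat, weight, transpose_b=True), [1, 3, vocab_size])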
...@@ -326,13 +331,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs) super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.n_embd = config.n_embd self.n_embd = config.n_embd
self.wte = TFGPT2Embeddings(config, name='wte') self.wte = TFGPT2Embeddings(config, name='wte')
self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe') self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx, config, scale=Truename='h_{}'.format(i)) for i in range(config.n_layer)] self.h = [TFBlock(config.n_ctx, config, scale=True, name='h_{}'.format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f') self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
...@@ -346,20 +352,20 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
@tf.function @tf.function
def call(self, inputs, training=False): def call(self, inputs, training=False):
input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
if not isinstance(inputs, (dict, tuple, list)): if not isinstance(inputs, (dict, tuple, list)):
input_ids = inputs input_ids = inputs
attention_mask, head_mask, position_ids, token_type_ids = None, None, None, None past, attention_mask, token_type_ids, position_ids, head_mask = None, None, None, None, None
elif isinstance(inputs, (tuple, list)): elif isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else None past = inputs[1] if len(inputs) > 1 else None
token_type_ids = inputs[2] if len(inputs) > 2 else None attention_mask = inputs[2] if len(inputs) > 2 else None
position_ids = inputs[3] if len(inputs) > 3 else None token_type_ids = inputs[3] if len(inputs) > 3 else None
head_mask = inputs[4] if len(inputs) > 4 else None position_ids = inputs[4] if len(inputs) > 4 else None
assert len(inputs) <= 5, "Too many inputs." head_mask = inputs[5] if len(inputs) > 5 else None
assert len(inputs) <= 6, "Too many inputs."
else: else:
input_ids = inputs.get('input_ids') input_ids = inputs.get('input_ids')
past = inputs.get('past', None)
attention_mask = inputs.get('attention_mask', None) attention_mask = inputs.get('attention_mask', None)
token_type_ids = inputs.get('token_type_ids', None) token_type_ids = inputs.get('token_type_ids', None)
position_ids = inputs.get('position_ids', None) position_ids = inputs.get('position_ids', None)
...@@ -370,49 +376,66 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
past_length = 0 past_length = 0
past = [None] * len(self.h) past = [None] * len(self.h)
else: else:
past_length = past[0][0].size(-2) past_length = shape_list(past[0][0])[-2]
if position_ids is None: if position_ids is None:
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if attention_mask is not None:
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_mask = tf.cast(attention_mask, tf.float32)
attention_mask = (1.0 - attention_mask) * -10000.0
else:
attention_mask = None
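A quick numeric check of the additive-mask conversion above, assuming a padded batch where 1 marks real tokens:

import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0, 0]])                   # 1 = real token, 0 = padding
mask = tf.cast(attention_mask[:, tf.newaxis, tf.newaxis, :], tf.float32)
mask = (1.0 - mask) * -10000.0                                    # -> [[[[0., 0., 0., -10000., -10000.]]]]
# shape (1, 1, 1, 5) broadcasts over (batch, heads, query_len, key_len); adding it to the raw
# attention scores before the softmax effectively removes the padded keys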
# Prepare head mask if needed # Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head # 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N # attention_probs has shape bsz x n_heads x N x N
# head_mask has shape n_layer x batch x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
if head_mask is not None: # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask.dim() == 1: if not head_mask is None:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) raise NotImplementedError
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
else: else:
head_mask = [None] * self.config.n_layer head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
input_shape = input_ids.size() input_shape = shape_list(input_ids)
input_ids = input_ids.view(-1, input_ids.size(-1)) input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
position_ids = position_ids.view(-1, position_ids.size(-1)) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
inputs_embeds = self.wte(input_ids) inputs_embeds = self.wte(input_ids, mode='embedding')
position_embeds = self.wpe(position_ids) position_embeds = self.wpe(position_ids)
if token_type_ids is not None: if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.wte(token_type_ids) token_type_embeds = self.wte(token_type_ids, mode='embedding')
else: else:
token_type_embeds = 0 token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds hidden_states = inputs_embeds + position_embeds + token_type_embeds
hidden_states = self.drop(hidden_states) if training:
hidden_states = self.drop(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),) output_shape = input_shape + [shape_list(hidden_states)[-1]]
presents = () presents = ()
all_attentions = [] all_attentions = []
all_hidden_states = () all_hidden_states = ()
for i, (block, layer_past) in enumerate(zip(self.h, past)): for i, (block, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states: if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
outputs = block([hidden_states, layer_past, attention_mask, head_mask[i]], training=training)
outputs = block(hidden_states, layer_past, head_mask[i])
hidden_states, present = outputs[:2] hidden_states, present = outputs[:2]
presents = presents + (present,) presents = presents + (present,)
...@@ -421,7 +444,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
hidden_states = self.ln_f(hidden_states) hidden_states = self.ln_f(hidden_states)
hidden_states = hidden_states.view(*output_shape) hidden_states = tf.reshape(hidden_states, output_shape)
# Add last hidden state # Add last hidden state
if self.output_hidden_states: if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
...@@ -431,18 +454,19 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if self.output_attentions: if self.output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning # let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:] attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions) all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
return outputs # last hidden state, presents, (all hidden_states), (attentions) return outputs # last hidden state, presents, (all hidden_states), (attentions)
class TFGPT2PreTrainedModel(TFPreTrainedModel): class TFGPT2PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """ An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models. a simple interface for dowloading and loading pretrained models.
""" """
config_class = GPT2Config config_class = GPT2Config
pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_gpt2 load_pt_weights = load_gpt2_pt_weights_in_tf
base_model_prefix = "transformer" base_model_prefix = "transformer"
...@@ -487,17 +511,21 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**past**: **past**:
list of ``torch.FloatTensor`` (one for each layer): list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding. (see `past` output below). Can be used to speed up sequential decoding.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
...@@ -526,7 +554,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
Examples:: Examples::
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2') model = GPT2Model.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids) outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
...@@ -534,149 +562,19 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
""" """
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super(TFGPT2Model, self).__init__(config, *inputs, **kwargs) super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states self.transformer = TFGPT2MainLayer(config, name='transformer')
self.output_attentions = config.output_attentions
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx, config, scale=Truename='h_{}'.format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
self.init_weights()
def build(self, input_shape):
"""Build shared word embedding layer
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
with tf.name_scope("wte"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.wte = self.add_weight(
"weight",
shape=[self.vocab_size, self.n_embed],
initializer=tf.random_normal_initializer(
mean=0., stddev=self.n_embed**-0.5))
super(TFGPT2Model, self).build(input_shape)
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
@tf.function @tf.function
def call(self, inputs, training=False): def call(self, inputs, training=False):
input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None): outputs = self.transformer(inputs, training=training)
return outputs
if not isinstance(inputs, (dict, tuple, list)):
input_ids = inputs
attention_mask, head_mask, position_ids, token_type_ids = None, None, None, None
elif isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else None
token_type_ids = inputs[2] if len(inputs) > 2 else None
position_ids = inputs[3] if len(inputs) > 3 else None
head_mask = inputs[4] if len(inputs) > 4 else None
assert len(inputs) <= 5, "Too many inputs."
else:
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', None)
token_type_ids = inputs.get('token_type_ids', None)
position_ids = inputs.get('position_ids', None)
head_mask = inputs.get('head_mask', None)
assert len(inputs) <= 5, "Too many inputs."
if past is None:
past_length = 0
past = [None] * len(self.h)
else:
past_length = past[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# head_mask has shape n_layer x batch x n_heads x N x N
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
else:
head_mask = [None] * self.config.n_layer
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
inputs_embeds = self.wte(input_ids)
position_embeds = self.wpe(position_ids)
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
token_type_embeds = self.wte(token_type_ids)
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
hidden_states = self.drop(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),)
presents = ()
all_attentions = []
all_hidden_states = ()
for i, (block, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = block(hidden_states, layer_past, head_mask[i])
hidden_states, present = outputs[:2]
presents = presents + (present,)
if self.output_attentions:
all_attentions.append(outputs[2])
hidden_states = self.ln_f(hidden_states)
hidden_states = hidden_states.view(*output_shape)
# Add last hidden state
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states, presents)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,)
return outputs # last hidden state, presents, (all hidden_states), (attentions)
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2LMHeadModel(GPT2PreTrainedModel): class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
r""" r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-1`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**past**: **past**:
...@@ -700,93 +598,38 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
model = GPT2LMHeadModel.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids) outputs = model(input_ids)
loss, logits = outputs[:2] logits = outputs[:2]
""" """
def __init__(self, config): def __init__(self, config, *inputs, **kwargs):
super(GPT2LMHeadModel, self).__init__(config) super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = GPT2Model(config) self.transformer = TFGPT2MainLayer(config, name='transformer')
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.init_weights()
self.tie_weights()
def tie_weights(self):
""" Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
"""
self._tie_or_clone_weights(self.lm_head,
self.transformer.wte)
def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None): @tf.function
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, def call(self, inputs, training=False):
past=past, head_mask=head_mask) transformer_outputs = self.transformer(inputs, training=training)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states) lm_logits = self.transformer.wte(hidden_states, mode="linear")
outputs = (lm_logits,) + transformer_outputs[1:] outputs = (lm_logits,) + transformer_outputs[1:]
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) return outputs # lm_logits, presents, (all hidden_states), (attentions)
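Since the TF head returns logits only (the label/loss branch of the PyTorch version is dropped), a greedy-generation sketch on top of it, with the import path assumed for this WIP branch:

import tensorflow as tf
from pytorch_transformers import GPT2Tokenizer, TFGPT2LMHeadModel  # import path assumed for this WIP branch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

input_ids = tf.constant([tokenizer.encode("Hello, my dog is")])
for _ in range(5):                                                 # greedy decoding, re-running the full prefix each step
    lm_logits = model(input_ids)[0]                                # (1, seq_len, vocab_size)
    next_id = tf.argmax(lm_logits[:, -1, :], axis=-1, output_type=tf.int32)
    input_ids = tf.concat([input_ids, next_id[:, None]], axis=-1)
print(tokenizer.decode(input_ids[0].numpy().tolist()))
# the second output (``presents``) can be fed back as ``past`` to avoid recomputing cached keys/values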
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification @add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings, The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the input of a specified classification token index in the input sequence). the classification head takes as input the input of a specified classification token index in the input sequence).
""", GPT2_START_DOCSTRING) """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel): class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
r""" Inputs: r"""
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**past**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-1`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Multiple choice classification loss.
**lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)`` **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
...@@ -827,43 +670,52 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2] lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')

    @tf.function
    def call(self, inputs, training=False):
        if not isinstance(inputs, (dict, tuple, list)):
            raise ValueError("Inputs should be a list or a dict with at least two elements: 'input_ids' and 'mc_token_ids'")
        elif isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mc_token_ids = inputs[1]
            past = inputs[2] if len(inputs) > 2 else None
            attention_mask = inputs[3] if len(inputs) > 3 else None
            token_type_ids = inputs[4] if len(inputs) > 4 else None
            position_ids = inputs[5] if len(inputs) > 5 else None
            head_mask = inputs[6] if len(inputs) > 6 else None
            assert len(inputs) <= 7, "Too many inputs."
        else:
            input_ids = inputs.get('input_ids')
            mc_token_ids = inputs.get('mc_token_ids')
            past = inputs.get('past', None)
            attention_mask = inputs.get('attention_mask', None)
            token_type_ids = inputs.get('token_type_ids', None)
            position_ids = inputs.get('position_ids', None)
            head_mask = inputs.get('head_mask', None)
            assert len(inputs) <= 7, "Too many inputs."

        # Flatten the (batch_size, num_choices, seq_length) inputs so the transformer
        # sees an ordinary (batch_size * num_choices, seq_length) batch.
        num_choices = shape_list(input_ids)[1]
        seq_length = shape_list(input_ids)[2]

        flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None

        flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
        transformer_outputs = self.transformer(flat_inputs, training=training)

        hidden_states = transformer_outputs[0]
        # Restore the choice dimension so the heads see (batch_size, num_choices, seq_length, hidden_size),
        # matching the output shapes documented above.
        hidden_states = tf.reshape(hidden_states, [-1, num_choices, seq_length, shape_list(hidden_states)[-1]])

        lm_logits = self.transformer.wte(hidden_states, mode="linear")
        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)

        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]

        return outputs  # lm logits, mc logits, presents, (all hidden_states), (attentions)
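A minimal usage sketch of the intended input shapes (not part of the commit; the TF double-heads test below is still stubbed out, so treat this as illustrative of the intended interface rather than verified behavior, and the config values are arbitrary — the import path follows the test file further down)::

    import tensorflow as tf
    from pytorch_transformers import GPT2Config
    from pytorch_transformers.modeling_tf_gpt2 import TFGPT2DoubleHeadsModel

    vocab_size = 99
    config = GPT2Config(vocab_size_or_config_json_file=vocab_size, n_embd=32, n_layer=2, n_head=4,
                        n_positions=64, n_ctx=64)
    model = TFGPT2DoubleHeadsModel(config)

    batch_size, num_choices, seq_length = 2, 3, 7
    # One tokenized sequence per choice.
    input_ids = tf.random.uniform((batch_size, num_choices, seq_length),
                                  maxval=vocab_size, dtype=tf.int32)
    # Position of the classification token inside each choice (here: the last token).
    mc_token_ids = tf.fill((batch_size, num_choices), seq_length - 1)

    lm_logits, mc_logits = model([input_ids, mc_token_ids])[:2]
    print(lm_logits.shape)  # language modeling scores for every position of every choice
    print(mc_logits.shape)  # one score (vector) per choice from the multiple choice head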
@@ -273,15 +273,117 @@ class TFConv1D(tf.keras.layers.Layer):
            mean=0., stddev=0.02))
        self.bias = self.add_weight(
            "bias",
            shape=[1, self.nf],
            initializer=tf.zeros_initializer())

    @tf.function
    def call(self, x):
        bz, sl = shape_list(x)[:2]

        x = tf.reshape(x, [-1, self.nx])
        x = tf.matmul(x, self.weight) + self.bias

        x = tf.reshape(x, [bz, sl, self.nf])

        return x
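TFConv1D is GPT-2's position-wise linear projection: the call above flattens the batch and sequence dimensions, applies a single matmul plus bias, and restores the leading shape. The same computation with plain TensorFlow ops, as a small sketch with illustrative sizes (``w`` and ``b`` stand in for ``self.weight`` and ``self.bias``)::

    import tensorflow as tf

    nx, nf = 8, 16                      # input / output feature sizes (illustrative)
    x = tf.random.normal((2, 5, nx))    # (batch, seq_len, nx)
    w = tf.random.normal((nx, nf))      # plays the role of self.weight
    b = tf.zeros((1, nf))               # plays the role of self.bias

    flat = tf.reshape(x, [-1, nx])      # (batch * seq_len, nx)
    out = tf.matmul(flat, w) + b        # (batch * seq_len, nf)
    out = tf.reshape(out, [2, 5, nf])   # back to (batch, seq_len, nf)
    print(out.shape)                    # (2, 5, 16)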
class TFSequenceSummary(tf.keras.layers.Layer):
r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
Args of the config class:
summary_type:
- 'last' => [default] take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj: Add a projection after the vector extraction
summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_activation: 'tanh' => add a tanh activation to the output, anything else => no activation (default)
summary_first_dropout: Add a dropout before the projection and activation
summary_last_dropout: Add a dropout after the projection and activation
"""
    def __init__(self, config, **kwargs):
        super(TFSequenceSummary, self).__init__(**kwargs)

        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
            raise NotImplementedError

        # Identity passthrough used when no projection is configured.
        self.summary = tf.keras.layers.Lambda(lambda x: x, name='summary')
        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = tf.keras.layers.Dense(num_classes, name='summary')

        self.activation = None
        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
            self.activation = tf.keras.layers.Activation('tanh')

        self.first_dropout = None
        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
            self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)

        self.last_dropout = None
        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
            self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
    @tf.function
    def call(self, inputs, training=False):
        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token
        """
        if not isinstance(inputs, (dict, tuple, list)):
            hidden_states = inputs
            cls_index = None
        elif isinstance(inputs, (tuple, list)):
            hidden_states = inputs[0]
            cls_index = inputs[1] if len(inputs) > 1 else None
            assert len(inputs) <= 2, "Too many inputs."
        else:
            hidden_states = inputs.get('hidden_states')
            cls_index = inputs.get('cls_index', None)

        if self.summary_type == 'last':
            output = hidden_states[:, -1]
        elif self.summary_type == 'first':
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
            output = tf.reduce_mean(hidden_states, axis=1)
        elif self.summary_type == 'cls_index':
            if cls_index is None:
                # default to the last token of each sequence as the classification token
                output = hidden_states[..., -1, :]
            else:
                # cls_index: shape (bsz,) or (bsz, ...) matching the leading dims of hidden_states;
                # gather the hidden state at that position along the sequence axis
                batch_dims = len(shape_list(cls_index))
                output = tf.gather(hidden_states, cls_index, axis=batch_dims, batch_dims=batch_dims)
            # output shape: (bsz, ..., hidden_size)
        elif self.summary_type == 'attn':
            raise NotImplementedError

        if self.first_dropout is not None:
            output = self.first_dropout(output, training=training)

        output = self.summary(output)

        if self.activation is not None:
            output = self.activation(output)

        if self.last_dropout is not None:
            output = self.last_dropout(output, training=training)

        return output
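A minimal sketch of using the layer in 'cls_index' mode. The config object here is an illustrative stand-in carrying only the attributes read in ``__init__`` above, not the library's real config class::

    import tensorflow as tf
    from types import SimpleNamespace

    config = SimpleNamespace(summary_type='cls_index',
                             summary_use_proj=True,
                             summary_proj_to_labels=False,
                             num_labels=0,
                             hidden_size=16,
                             summary_activation='tanh',
                             summary_first_dropout=0.0,
                             summary_last_dropout=0.1)

    summary = TFSequenceSummary(config, name='multiple_choice_head')

    hidden_states = tf.random.normal((2, 7, 16))     # (bsz, seq_len, hidden_size)
    cls_index = tf.constant([6, 3], dtype=tf.int32)  # classification token position per example
    output = summary([hidden_states, cls_index])     # one summary vector per example
    print(output.shape)                              # (2, 16)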
def shape_list(x):
"""Deal with dynamic shape in tensorflow cleanly."""
static = x.shape.as_list()
dynamic = tf.shape(x)
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
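A small sketch (not from the commit) of why mixing static and dynamic dimensions matters: inside a ``tf.function`` whose batch dimension is unknown, ``shape_list`` returns a scalar Tensor for that axis and plain Python ints for the rest::

    import tensorflow as tf

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, 7, 32], dtype=tf.float32)])
    def flatten_features(x):
        bz, sl, hd = shape_list(x)   # bz is a scalar int32 Tensor, sl and hd are the ints 7 and 32
        return tf.reshape(x, [bz, sl * hd])

    y = flatten_features(tf.zeros((2, 7, 32)))
    print(y.shape)  # (2, 224)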
@@ -44,6 +44,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                 seq_length=7,
                 is_training=True,
                 use_token_type_ids=True,
                 use_input_mask=True,
                 use_labels=True,
                 vocab_size=99,
                 hidden_size=32,
@@ -66,6 +67,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_token_type_ids = use_token_type_ids
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
@@ -86,6 +88,10 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
@@ -115,14 +121,14 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels

    def check_loss_output(self, result):
        self.parent.assertListEqual(
            list(result["loss"].size()),
            [])

    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2Model(config=config)
        model.eval()
@@ -139,7 +145,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
            [self.batch_size, self.seq_length, self.hidden_size])
        self.parent.assertEqual(len(result["presents"]), config.n_layer)

    def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2LMHeadModel(config)
        model.eval()
@@ -157,7 +163,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
            list(result["lm_logits"].size()),
            [self.batch_size, self.seq_length, self.vocab_size])

    def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2DoubleHeadsModel(config)
        model.eval()
@@ -177,7 +183,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs

        inputs_dict = {
            'input_ids': input_ids,
            'token_type_ids': token_type_ids,
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from pytorch_transformers import GPT2Config, is_tf_available
try:
import tensorflow as tf
from pytorch_transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
TFGPT2DoubleHeadsModel,
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
except ImportError:
pytestmark = pytest.mark.skip("Require TensorFlow")
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
TFGPT2DoubleHeadsModel) if is_tf_available() else ()
class TFGPT2ModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_token_type_ids=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_token_type_ids = use_token_type_ids
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = GPT2Config(
vocab_size_or_config_json_file=self.vocab_size,
n_embd=self.hidden_size,
n_layer=self.num_hidden_layers,
n_head=self.num_attention_heads,
# intermediate_size=self.intermediate_size,
# hidden_act=self.hidden_act,
# hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings,
n_ctx=self.max_position_embeddings
# type_vocab_size=self.type_vocab_size,
# initializer_range=self.initializer_range
)
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
return config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFGPT2Model(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output = model(inputs)[0]
inputs = [input_ids, None, input_mask] # None is the input for 'past'
sequence_output = model(inputs)[0]
sequence_output = model(input_ids)[0]
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFGPT2LMHeadModel(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
prediction_scores = model(inputs)[0]
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
pass
# model = TFGPT2DoubleHeadsModel(config=config)
# inputs = {'input_ids': input_ids,
# 'attention_mask': input_mask,
# 'token_type_ids': token_type_ids}
# seq_relationship_score, = model(inputs)[0]
# result = {
# "seq_relationship_score": seq_relationship_score.numpy(),
# }
# self.parent.assertListEqual(
# list(result["seq_relationship_score"].shape),
# [self.batch_size, 2])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, head_mask, token_type_ids,
sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFGPT2ModelTest.TFGPT2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=GPT2Config, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_gpt2_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
def test_gpt2_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)
def test_gpt2_double_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()