chenpangpang / transformers

Commit e25cba78, authored Sep 04, 2019 by thomwolf
parent 38b79b5a

    WIP reodering arguments for torchscript and TF

Showing 10 changed files with 331 additions and 253 deletions (+331 / -253)
Changed files:

    pytorch_transformers/modeling_bert.py                     +74   -77
    pytorch_transformers/modeling_distilbert.py                +9    -9
    pytorch_transformers/modeling_gpt2.py                     +65   -44
    pytorch_transformers/modeling_openai.py                   +49   -34
    pytorch_transformers/modeling_roberta.py                  +33   -14
    pytorch_transformers/modeling_transfo_xl.py                +1    -1
    pytorch_transformers/modeling_xlm.py                      +44   -29
    pytorch_transformers/modeling_xlnet.py                    +41   -32
    pytorch_transformers/tests/modeling_bert_test.py          +13   -11
    pytorch_transformers/tests/modeling_distilbert_test.py     +2    -2
pytorch_transformers/modeling_bert.py

@@ -596,18 +596,18 @@ BERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Segment token indices to indicate first and second portions of the inputs.
             Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
             corresponds to a `sentence B` token
             (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
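Editor's note on the hunk above: the reordering only changes the documented (and positional) order of the inputs, not how they are built. A minimal sketch, not part of this commit, of constructing the input_ids and attention_mask described above; the bert-base-uncased checkpoint name is only illustrative:

    import torch
    from pytorch_transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Token indices, as described under **input_ids**
    input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
    # 1 for real tokens, 0 for padding, as described under **attention_mask**
    attention_mask = torch.ones_like(input_ids)

    outputs = model(input_ids, attention_mask=attention_mask)
    sequence_output = outputs[0]   # (batch_size, sequence_length, hidden_size)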
@@ -668,7 +668,7 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
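The commit title ties this reordering to torchscript and TF. The practical effect (an inference from the commit message, not stated in the diff itself) is that torch.jit.trace, which feeds its example inputs positionally, can now be given (input_ids, attention_mask) without silently routing the mask into token_type_ids. A hedged sketch, again with an illustrative checkpoint name:

    import torch
    from pytorch_transformers import BertModel

    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    input_ids = torch.zeros(1, 8, dtype=torch.long)
    attention_mask = torch.ones(1, 8, dtype=torch.long)

    # torch.jit.trace passes example inputs positionally, so with the reordered
    # signature the second positional slot is attention_mask (not token_type_ids).
    traced = torch.jit.trace(model, (input_ids, attention_mask))
    traced_outputs = traced(input_ids, attention_mask)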
@@ -771,10 +771,14 @@ class BertForPreTraining(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                next_sentence_label=None, position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None, next_sentence_label=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         sequence_output, pooled_output = outputs[:2]
         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

@@ -839,10 +843,14 @@ class BertForMaskedLM(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)

@@ -896,10 +904,15 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                next_sentence_label=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         pooled_output = outputs[1]
         seq_relationship_score = self.cls(pooled_output)

@@ -957,10 +970,15 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         pooled_output = outputs[1]
         pooled_output = self.dropout(pooled_output)

@@ -983,45 +1001,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
 @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    BERT_START_DOCSTRING)
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
     r"""
-    Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-            (a) For sequence pairs:
-                ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
-            (b) For single sequences:
-                ``tokens: [CLS] the dog is hairy . [SEP]``
-                ``token_type_ids: 0 0 0 0 0 0 0``
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Segment token indices to indicate first and second portions of the inputs.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Labels for computing the multiple choice classification loss.
             Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension

@@ -1061,16 +1043,21 @@ class BertForMultipleChoice(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
         num_choices = input_ids.shape[1]
 
-        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
-        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
-        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
-        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
-                            attention_mask=flat_attention_mask, head_mask=head_mask)
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         pooled_output = outputs[1]
         pooled_output = self.dropout(pooled_output)
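The multiple-choice head above flattens (batch_size, num_choices, sequence_length) inputs into (batch_size * num_choices, sequence_length) rows before calling self.bert; the diff only drops the flat_* temporaries and reuses the argument names. A small standalone sketch of that reshape (not part of the commit; the per-choice re-grouping happens further down the file and is shown here only as a comment):

    import torch

    batch_size, num_choices, seq_len = 2, 4, 16
    input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
    attention_mask = torch.ones(batch_size, num_choices, seq_len, dtype=torch.long)

    num_choices = input_ids.shape[1]
    # Same reshape as the new forward(): every choice becomes its own row
    input_ids = input_ids.view(-1, input_ids.size(-1))                  # (8, 16)
    attention_mask = attention_mask.view(-1, attention_mask.size(-1))   # (8, 16)

    # Later in the model (not shown in this hunk), the per-row classifier logits
    # are reshaped back to one score per choice, roughly:
    #     reshaped_logits = logits.view(-1, num_choices)    # (batch_size, num_choices)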
@@ -1129,10 +1116,15 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         sequence_output = outputs[0]
         sequence_output = self.dropout(sequence_output)

@@ -1203,10 +1195,15 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
-                end_positions=None, position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                start_positions=None, end_positions=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         sequence_output = outputs[0]
         logits = self.qa_outputs(sequence_output)


pytorch_transformers/modeling_distilbert.py

@@ -585,10 +585,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         self._tie_or_clone_weights(self.vocab_projector,
                                    self.distilbert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None):
         dlbrt_output = self.distilbert(input_ids=input_ids,
                                        attention_mask=attention_mask,
                                        head_mask=head_mask)
         hidden_states = dlbrt_output[0]                           # (bs, seq_length, dim)
         prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
         prediction_logits = gelu(prediction_logits)              # (bs, seq_length, dim)

@@ -649,10 +649,10 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
         distilbert_output = self.distilbert(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             head_mask=head_mask)
         hidden_state = distilbert_output[0]                  # (bs, seq_len, dim)
         pooled_output = hidden_state[:, 0]                   # (bs, dim)
         pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)

@@ -723,10 +723,10 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None):
         distilbert_output = self.distilbert(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             head_mask=head_mask)
         hidden_states = distilbert_output[0]                 # (bs, max_query_len, dim)
         hidden_states = self.dropout(hidden_states)          # (bs, max_query_len, dim)
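The DistilBERT changes above only swap head_mask with the label-style arguments. Callers using keyword arguments are unaffected; purely positional callers must account for the new slot order. An illustrative sketch, not from the diff (distilbert-base-uncased is just an example checkpoint, and the classification head here is randomly initialized):

    import torch
    from pytorch_transformers import DistilBertForSequenceClassification

    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    input_ids = torch.zeros(1, 8, dtype=torch.long)
    attention_mask = torch.ones(1, 8, dtype=torch.long)
    labels = torch.tensor([0])

    # Keyword arguments are unaffected by the reordering:
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss, logits = outputs[:2]

    # Positional callers must now place head_mask before the labels:
    #   old order: model(input_ids, attention_mask, labels)
    #   new order: model(input_ids, attention_mask, head_mask, labels)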
pytorch_transformers/modeling_gpt2.py

@@ -257,7 +257,7 @@ class Attention(nn.Module):
         self.n_head = self.n_head - len(heads)
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def _attn(self, q, k, v, head_mask=None):
+    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))

@@ -265,6 +265,10 @@ class Attention(nn.Module):
         b = self.bias[:, :, ns-nd:ns, :ns]
         w = w * b - 1e4 * (1 - b)
 
+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
         w = nn.Softmax(dim=-1)(w)
         w = self.attn_dropout(w)
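For context on the `w = w + attention_mask` line added above: by the time it reaches _attn, the mask has already been converted (in GPT2Model.forward, later in this diff) into an additive bias that is 0 for visible positions and -10000 for padded ones, so adding it before the softmax effectively removes attention to padding. A tiny self-contained sketch of that effect, not part of the commit:

    import torch

    # Toy attention scores for one query over 4 keys
    w = torch.tensor([[1.0, 2.0, 3.0, 4.0]])

    # Padding mask: 1 = attend, 0 = padding (last position is padding)
    mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])
    additive_mask = (1.0 - mask) * -10000.0   # 0 where allowed, -10000 where masked

    w = w + additive_mask
    probs = torch.softmax(w, dim=-1)
    # probs[..., -1] is ~0: the padded key effectively receives no attention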
@@ -290,7 +294,7 @@ class Attention(nn.Module):
         else:
             return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
 
-    def forward(self, x, layer_past=None, head_mask=None):
+    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
         x = self.c_attn(x)
         query, key, value = x.split(self.split_size, dim=2)
         query = self.split_heads(query)

@@ -302,7 +306,7 @@ class Attention(nn.Module):
             value = torch.cat((past_value, value), dim=-2)
         present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
 
-        attn_outputs = self._attn(query, key, value, head_mask)
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
         a = attn_outputs[0]
 
         a = self.merge_heads(a)

@@ -337,8 +341,11 @@ class Block(nn.Module):
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)
 
-    def forward(self, x, layer_past=None, head_mask=None):
-        output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
+    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
+        output_attn = self.attn(self.ln_1(x),
+                                layer_past=layer_past,
+                                attention_mask=attention_mask,
+                                head_mask=head_mask)
         a = output_attn[0]  # output_attn: a, present, (attentions)
 
         x = x + a

@@ -404,17 +411,21 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **past**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `past` output below). Can be used to speed up sequential decoding.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:

@@ -473,7 +484,7 @@ class GPT2Model(GPT2PreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if past is None:
             past_length = 0
             past = [None] * len(self.h)

@@ -483,6 +494,23 @@ class GPT2Model(GPT2PreTrainedModel):
             position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 
+        # Attention mask.
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
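A standalone sketch of what the mask preparation added above computes, using plain tensors (not part of the commit; float32 is assumed where the real code uses the model's parameter dtype):

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]], dtype=torch.float)   # (batch, to_seq_length)

    # Same steps as the new code above:
    extended = attention_mask.unsqueeze(1).unsqueeze(2)   # (batch, 1, 1, to_seq_length)
    extended = extended.to(dtype=torch.float)             # fp16 models would use their own dtype here
    extended = (1.0 - extended) * -10000.0

    print(extended.shape)   # torch.Size([2, 1, 1, 4])
    # Broadcasting against scores of shape (batch, num_heads, from_seq, to_seq)
    # adds 0 to allowed positions and -10000 to padded ones.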
@@ -520,7 +548,11 @@ class GPT2Model(GPT2PreTrainedModel):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
 
-            outputs = block(hidden_states, layer_past, head_mask[i])
+            outputs = block(hidden_states,
+                            past=layer_past,
+                            attention_mask=attention_mask,
+                            head_mask=head_mask[i])
             hidden_states, present = outputs[:2]
             presents = presents + (present,)

@@ -601,9 +633,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.wte)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               past=past, head_mask=head_mask)
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               past=past,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
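Since the reordered GPT2LMHeadModel.forward now takes `past` directly after input_ids, here is a hedged usage sketch of the sequential-decoding pattern the **past** docstring refers to (not part of the diff; the gpt2 checkpoint name and greedy argmax step are only illustrative):

    import torch
    from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    context = torch.tensor([tokenizer.encode("The commit reorders")])

    with torch.no_grad():
        # First pass: full context; `past` caches the keys/values of every layer
        logits, past = model(context)[:2]
        next_token = logits[0, -1].argmax().view(1, 1)

        # Later passes: only the new token plus the cached `past`, which now sits
        # in the second positional slot of forward()
        logits, past = model(next_token, past=past)[:2]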
@@ -626,33 +663,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
     The language modeling head has its weights tied to the input embeddings,
     the classification head takes as input the input of a specified classification token index in the input sequence).
-""", GPT2_START_DOCSTRING)
+""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
-    r""" Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+    r"""
         **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
             Index of the classification token in each input sequence.
             Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
-        **past**:
-            list of ``torch.FloatTensor`` (one for each layer):
-            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `past` output below). Can be used to speed up sequential decoding.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``

@@ -725,10 +741,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.wte)
 
-    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
-                position_ids=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               past=past, head_mask=head_mask)
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                mc_token_ids=None, lm_labels=None, mc_labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               past=past,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
pytorch_transformers/modeling_openai.py

@@ -270,7 +270,7 @@ class Attention(nn.Module):
         self.n_head = self.n_head - len(heads)
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def _attn(self, q, k, v, head_mask=None):
+    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))

@@ -279,6 +279,10 @@ class Attention(nn.Module):
         b = self.bias[:, :, : w.size(-2), : w.size(-1)]
         w = w * b + -1e9 * (1 - b)
 
+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
         w = nn.Softmax(dim=-1)(w)
         w = self.attn_dropout(w)

@@ -304,14 +308,14 @@ class Attention(nn.Module):
         else:
             return x.permute(0, 2, 1, 3)
 
-    def forward(self, x, head_mask=None):
+    def forward(self, x, attention_mask=None, head_mask=None):
         x = self.c_attn(x)
         query, key, value = x.split(self.split_size, dim=2)
         query = self.split_heads(query)
         key = self.split_heads(key, k=True)
         value = self.split_heads(value)
 
-        attn_outputs = self._attn(query, key, value, head_mask)
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
         a = attn_outputs[0]
 
         a = self.merge_heads(a)

@@ -346,8 +350,8 @@ class Block(nn.Module):
         self.mlp = MLP(4 * nx, config)
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
 
-    def forward(self, x, head_mask=None):
-        attn_outputs = self.attn(x, head_mask=head_mask)
+    def forward(self, x, attention_mask=None, head_mask=None):
+        attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask)
         a = attn_outputs[0]
 
         n = self.ln_1(x + a)
@@ -410,13 +414,17 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             A parallel sequence of tokens (can be used to indicate various portions of the inputs).
             The embeddings from these tokens will be summed with the respective token embeddings.
             Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:

@@ -470,7 +478,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if position_ids is None:
             # This was used when we had a single embedding matrice from position and token embeddings
             # start = self.config.vocab_size + self.config.n_special

@@ -479,6 +487,23 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 
+        # Attention mask.
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N

@@ -515,7 +540,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
 
-            outputs = block(hidden_states, head_mask[i])
+            outputs = block(hidden_states, attention_mask, head_mask[i])
             hidden_states = outputs[0]
             if self.output_attentions:
                 all_attentions = all_attentions + (outputs[1],)
@@ -580,8 +605,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.tokens_embed)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        transformer_outputs = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
+                                               position_ids=position_ids, head_mask=head_mask)
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)

@@ -604,29 +633,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
     The language modeling head has its weights tied to the input embeddings,
     the classification head takes as input the input of a specified classification token index in the input sequence).
-""", OPENAI_GPT_START_DOCSTRING)
+""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
-    r""" Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+    r"""
         **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
             Index of the classification token in each input sequence.
             Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``

@@ -687,9 +699,12 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.tokens_embed)
 
-    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
-                position_ids=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                lm_labels=None, mc_labels=None):
+        transformer_outputs = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
+                                               position_ids=position_ids, head_mask=head_mask)
         hidden_states = transformer_outputs[0]
pytorch_transformers/modeling_roberta.py

@@ -61,7 +61,9 @@ class RobertaEmbeddings(BertEmbeddings):
             # cf. fairseq's `utils.make_positions`
             position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
         return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
 
 
 class RobertaConfig(BertConfig):

@@ -116,13 +118,20 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1[``.
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional segment token indices to indicate first and second portions of the inputs.
+            This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it
+            during finetuning.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -170,12 +179,16 @@ class RobertaModel(BertModel):
         self.embeddings = RobertaEmbeddings(config)
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if input_ids[:, 0].sum().item() != 0:
             logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
                            "This model requires special tokens in order to work. "
                            "Please specify add_special_tokens=True in your encoding.")
-        return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask)
+        return super(RobertaModel, self).forward(input_ids,
+                                                 attention_mask=attention_mask,
+                                                 token_type_ids=token_type_ids,
+                                                 position_ids=position_ids,
+                                                 head_mask=head_mask)
 
 
 @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
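The warning emitted above expects every sequence to start with RoBERTa's <s> token (id 0), which is what `input_ids[:, 0].sum().item() != 0` checks. A minimal sketch of an encoding that satisfies it (not part of the commit; roberta-base is an example checkpoint):

    import torch
    from pytorch_transformers import RobertaModel, RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base')

    # add_special_tokens=True adds the <s> / </s> markers that the warning in
    # RobertaModel.forward above checks for (first token id must be 0).
    input_ids = torch.tensor([tokenizer.encode("Hello world", add_special_tokens=True)])
    attention_mask = torch.ones_like(input_ids)

    outputs = model(input_ids, attention_mask=attention_mask)
    sequence_output = outputs[0]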
@@ -229,10 +242,13 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         """
         self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None,
-                head_mask=None):
-        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                               attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None):
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)

@@ -313,10 +329,13 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         self.roberta = RobertaModel(config)
         self.classifier = RobertaClassificationHead(config)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                               attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
         sequence_output = outputs[0]
         logits = self.classifier(sequence_output)
pytorch_transformers/modeling_transfo_xl.py
View file @ e25cba78
...
@@ -1342,7 +1342,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     def init_mems(self, data):
         return self.transformer.init_mems(data)

-    def forward(self, input_ids, labels=None, mems=None, head_mask=None):
+    def forward(self, input_ids, mems=None, head_mask=None, labels=None):
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)
...
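The Transformer-XL hunk is the smallest, but it shows the rule the whole commit follows: loss-related arguments (`labels`) move to the end, after the memory/mask arguments. Calls that pass `labels` by keyword are unaffected; purely positional calls change meaning. An illustrative comparison (the `model`, `input_ids` and `lm_labels` names are placeholders, not taken from the diff):

    # before: forward(self, input_ids, labels=None, mems=None, head_mask=None)
    # after:  forward(self, input_ids, mems=None, head_mask=None, labels=None)
    outputs = model(input_ids, labels=lm_labels)   # keyword call: same meaning before and after
    # outputs = model(input_ids, lm_labels)        # positional call: would now bind lm_labels to `mems`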
pytorch_transformers/modeling_xlm.py
View file @ e25cba78
...
@@ -441,23 +441,23 @@ XLM_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             A parallel sequence of tokens to be used to indicate the language of each token in the input.
             Indices are languages ids which can be obtained from the language names by using two conversion mappings
             provided in the configuration of the model (only provided for multilingual models).
             More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
             the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Length of each sentence that can be used to avoid performing attention on padding token indices.
             You can also use `attention_mask` for the same result (see above), kept here for compatbility.
...
@@ -584,8 +584,8 @@ class XLMModel(XLMPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.attentions[layer].prune_heads(heads)

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, head_mask=None):  # src_enc=None, src_len=None
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None):  # removed: src_enc=None, src_len=None
         if lengths is None:
             lengths = (input_ids != self.pad_index).sum(dim=1).long()
             # mask = input_ids != self.pad_index
...
@@ -790,11 +790,16 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         """
         self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
         outputs = self.pred_layer(output, labels)
...
@@ -846,11 +851,16 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
         logits = self.sequence_summary(output)
...
@@ -924,12 +934,17 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, start_positions=None, end_positions=None,
-                cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
+                is_impossible=None, cls_index=None, p_mask=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
...
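After this change every XLM head shares the same leading order, `input_ids, attention_mask, langs, token_type_ids, position_ids, lengths, cache, head_mask`, with the task-specific arguments (`labels`, `start_positions`, ...) trailing. A hedged sketch of a call site under the new order (`xlm_model` and the input tensors are placeholder names, not from the diff):

    # the padding mask can now be passed as the second positional argument ...
    output = xlm_model(input_ids, attention_mask)[0]

    # ... while keyword calls stay order-independent and work before and after the change
    output = xlm_model(input_ids,
                       attention_mask=attention_mask,
                       langs=None,      # language ids only matter for multilingual checkpoints
                       lengths=None)[0]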
pytorch_transformers/modeling_xlnet.py
View file @ e25cba78
...
@@ -647,21 +647,10 @@ XLNET_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
-            Kept for compatibility with the original code base.
-            You can only uses one of `input_mask` and `attention_mask`
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
         **mems**: (`optional`)
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
...
@@ -679,6 +668,17 @@ XLNET_INPUTS_DOCSTRING = r"""
             Mask to indicate the output tokens to use.
             If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
             Only used during pretraining for partial prediction or for sequential decoding (generation).
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
+            Kept for compatibility with the original code base.
+            You can only uses one of `input_mask` and `attention_mask`
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
...
@@ -837,8 +837,8 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None):
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
...
@@ -1042,12 +1042,15 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         """
         self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
         logits = self.lm_loss(transformer_outputs[0])
...
@@ -1113,12 +1116,15 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
...
@@ -1215,13 +1221,16 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
-                head_mask=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None,
+                start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
         start_logits = self.start_logits(hidden_states, p_mask=p_mask)
...
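The XLNet docstring hunks mirror the signature hunks: `token_type_ids` and `input_mask` are now documented after the permutation/memory arguments, which is also where they sit in `forward`. Per the docstring, `input_mask` is just the negation of `attention_mask`, so a caller sets only one of the two. A small sketch with placeholder names (`xlnet_model`, `past_mems`), not taken from the commit:

    # new order: input_ids, attention_mask, mems, perm_mask, target_mapping, token_type_ids, input_mask, head_mask
    outputs = xlnet_model(input_ids, attention_mask=attention_mask, mems=past_mems)
    hidden_states = outputs[0]   # outputs[1] holds the updated memories (see the model's Outputs docstring)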
pytorch_transformers/tests/modeling_bert_test.py
View file @ e25cba78
...
@@ -126,8 +126,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertModel(config=config)
         model.eval()
-        sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
-        sequence_output, pooled_output = model(input_ids, token_type_ids)
+        sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+        sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
         sequence_output, pooled_output = model(input_ids)
         result = {
...
@@ -143,7 +143,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertForMaskedLM(config=config)
         model.eval()
-        loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
+        loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
         result = {
             "loss": loss,
             "prediction_scores": prediction_scores,
...
@@ -156,7 +156,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertForNextSentencePrediction(config=config)
         model.eval()
-        loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
+        loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
         result = {
             "loss": loss,
             "seq_relationship_score": seq_relationship_score,
...
@@ -170,7 +170,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertForPreTraining(config=config)
         model.eval()
-        loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
+        loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                                                 masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
         result = {
             "loss": loss,
             "prediction_scores": prediction_scores,
...
@@ -188,7 +189,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertForQuestionAnswering(config=config)
         model.eval()
-        loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
+        loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                               start_positions=sequence_labels, end_positions=sequence_labels)
         result = {
             "loss": loss,
             "start_logits": start_logits,
...
@@ -207,7 +209,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config.num_labels = self.num_labels
         model = BertForSequenceClassification(config)
         model.eval()
-        loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
+        loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
         result = {
             "loss": loss,
             "logits": logits,
...
@@ -222,7 +224,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config.num_labels = self.num_labels
         model = BertForTokenClassification(config=config)
         model.eval()
-        loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
+        loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
         result = {
             "loss": loss,
             "logits": logits,
...
@@ -241,9 +243,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
         multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
         loss, logits = model(multiple_choice_inputs_ids,
-                             multiple_choice_token_type_ids,
-                             multiple_choice_input_mask,
-                             choice_labels)
+                             attention_mask=multiple_choice_input_mask,
+                             token_type_ids=multiple_choice_token_type_ids,
+                             labels=choice_labels)
         result = {
             "loss": loss,
             "logits": logits,
...
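The test changes pin the convention down by always passing `attention_mask` and `token_type_ids` as keywords. A sanity check that is implied by (but not added in) this commit is that, under the new order, positional and keyword calls resolve to the same computation; a hedged sketch reusing the fixture names from the test above:

    import torch

    model = BertModel(config=config)
    model.eval()
    with torch.no_grad():
        out_kw = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
        out_pos = model(input_ids, input_mask, token_type_ids)[0]   # same binding under the new order
    assert torch.allclose(out_kw, out_pos)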
pytorch_transformers/tests/modeling_distilbert_test.py
View file @ e25cba78
...
@@ -148,7 +148,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = DistilBertForQuestionAnswering(config=config)
         model.eval()
-        loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
+        loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
         result = {
             "loss": loss,
             "start_logits": start_logits,
...
@@ -166,7 +166,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         config.num_labels = self.num_labels
         model = DistilBertForSequenceClassification(config)
         model.eval()
-        loss, logits = model(input_ids, input_mask, sequence_labels)
+        loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
         result = {
             "loss": loss,
             "logits": logits,
...