Commit 44c985fa authored by thomwolf

update doc for XLM and XLNet

parent 0201d860
@@ -611,11 +611,11 @@ BERT_INPUTS_DOCSTRING = r"""
 (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
 **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
 Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
 Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@@ -714,7 +714,7 @@ class BertModel(BertPreTrainedModel):
 return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
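The ``attention_mask`` and ``head_mask`` inputs documented above apply to all of the BERT classes in this file. A minimal sketch of building them for a padded batch (assuming the `pytorch_transformers` package and the `bert-base-uncased` checkpoint, whose padding id is 0):

    import torch
    from pytorch_transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Two sequences of different lengths, padded to the same length with 0 ([PAD]).
    batch = [tokenizer.encode("Hello, my dog is cute"),
             tokenizer.encode("Hello")]
    max_len = max(len(ids) for ids in batch)
    input_ids = torch.tensor([ids + [0] * (max_len - len(ids)) for ids in batch])

    # attention_mask: 1 for real tokens, 0 for padding.
    attention_mask = (input_ids != 0).long()

    # head_mask: 1 keeps a head, 0 masks it; here every head of every layer is kept.
    head_mask = torch.ones(model.config.num_hidden_layers, model.config.num_attention_heads)

    outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask)
    sequence_output, pooled_output = outputs[:2]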
-@add_start_docstrings("""Bert Model transformer BERT model with two heads on top as done during the pre-training:
+@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
 a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
 BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForPreTraining(BertPreTrainedModel):
@@ -791,7 +791,7 @@ class BertForPreTraining(BertPreTrainedModel):
 return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a `language modeling` head on top. """,
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
 BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForMaskedLM(BertPreTrainedModel):
 r"""
@@ -856,7 +856,7 @@ class BertForMaskedLM(BertPreTrainedModel):
 return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a `next sentence prediction (classification)` head on top. """,
+@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
 BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForNextSentencePrediction(BertPreTrainedModel):
 r"""
@@ -913,7 +913,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
 the pooled output) e.g. for GLUE tasks. """,
 BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForSequenceClassification(BertPreTrainedModel):
@@ -981,7 +981,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
 return outputs # (loss), logits, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
 the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
 BERT_START_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
@@ -1016,11 +1016,11 @@ class BertForMultipleChoice(BertPreTrainedModel):
 **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
 Mask to avoid performing attention on padding token indices.
 The second dimension of the input (`num_choices`) indicates the number of choices to score.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
 Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
 Labels for computing the multiple choice classification loss.
@@ -1087,7 +1087,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
 return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
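To make the extra `num_choices` dimension concrete, here is a minimal multiple-choice sketch (assuming `pytorch_transformers` and the `bert-base-uncased` checkpoint; the two candidate sentences are chosen so that they tokenize to the same length):

    import torch
    from pytorch_transformers import BertTokenizer, BertForMultipleChoice

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMultipleChoice.from_pretrained('bert-base-uncased')

    # One example with two candidate continuations; inputs have shape
    # (batch_size, num_choices, sequence_length).
    choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
    input_ids = torch.tensor([tokenizer.encode(c) for c in choices]).unsqueeze(0)  # batch size 1
    labels = torch.tensor(1).unsqueeze(0)  # the second choice is the correct one

    outputs = model(input_ids, labels=labels)
    loss, classification_scores = outputs[:2]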
-@add_start_docstrings("""Bert Model transformer BERT model with a token classification head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
 the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
 BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForTokenClassification(BertPreTrainedModel):
@@ -1154,17 +1154,17 @@ class BertForTokenClassification(BertPreTrainedModel):
 return outputs # (loss), scores, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
 the hidden-states output to compute `span start logits` and `span end logits`). """,
 BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForQuestionAnswering(BertPreTrainedModel):
 r"""
 **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-Position (index) of the start of the labelled span for computing the token classification loss.
+Labels for position (index) of the start of the labelled span for computing the token classification loss.
 Positions are clamped to the length of the sequence (`sequence_length`).
 Position outside of the sequence are not taken into account for computing the loss.
 **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-Position (index) of the end of the labelled span for computing the token classification loss.
+Labels for position (index) of the end of the labelled span for computing the token classification loss.
 Positions are clamped to the length of the sequence (`sequence_length`).
 Position outside of the sequence are not taken into account for computing the loss.
......
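The `start_positions`/`end_positions` labels described above are plain token indices into the input sequence. A minimal sketch (again assuming `pytorch_transformers` and `bert-base-uncased`; the span indices are illustrative and depend on how the text actually tokenizes):

    import torch
    from pytorch_transformers import BertTokenizer, BertForQuestionAnswering

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

    input_ids = torch.tensor(tokenizer.encode("Who was Jim Henson ? Jim Henson was a puppeteer")).unsqueeze(0)
    # Hypothetical token indices of the answer span ("a puppeteer"), shape (batch_size,);
    # the exact values depend on the tokenization.
    start_positions = torch.tensor([8])
    end_positions = torch.tensor([10])

    outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
    total_loss, start_logits, end_logits = outputs[:3]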
@@ -404,11 +404,11 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
 (see `past` output below). Can be used to speed up sequential decoding.
 **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
 Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
 Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@@ -541,7 +541,7 @@ class GPT2Model(GPT2PreTrainedModel):
 (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2LMHeadModel(GPT2PreTrainedModel):
 r"""
-**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
 Labels for language modeling.
 Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
 Indices are selected in ``[-1, 0, ..., config.vocab_size]``
@@ -549,7 +549,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 computed for labels in ``[0, ..., config.vocab_size]``
 Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
 Language modeling loss.
 **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
 Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -571,7 +571,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 >>> model = GPT2LMHeadModel(config)
 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
->>> outputs = model(input_ids, lm_labels=input_ids)
+>>> outputs = model(input_ids, labels=input_ids)
 >>> loss, logits = outputs[:2]
 """
@@ -590,17 +590,17 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 self._tie_or_clone_weights(self.lm_head,
 self.transformer.wte)
-def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
+def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
 transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
 hidden_states = transformer_outputs[0]
 lm_logits = self.lm_head(hidden_states)
 outputs = (lm_logits,) + transformer_outputs[1:]
-if lm_labels is not None:
+if labels is not None:
 # Shift so that tokens < n predict n
 shift_logits = lm_logits[..., :-1, :].contiguous()
-shift_labels = lm_labels[..., 1:].contiguous()
+shift_labels = labels[..., 1:].contiguous()
 # Flatten the tokens
 loss_fct = CrossEntropyLoss(ignore_index=-1)
 loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
@@ -639,11 +639,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 (see `past` output below). Can be used to speed up sequential decoding.
 **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
 Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
 Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
 Labels for language modeling.
......
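The label shift performed in `GPT2LMHeadModel.forward` above (tokens `< n` predict token `n`) can be easier to follow on a toy tensor; this standalone sketch mirrors that computation with made-up numbers:

    import torch
    from torch.nn import CrossEntropyLoss

    # Toy tensors: logits at position i are scored against the token at position i + 1,
    # exactly like the shift in the forward pass shown above.
    vocab_size = 11
    lm_logits = torch.randn(1, 5, vocab_size)           # (batch, seq_len, vocab)
    labels = torch.tensor([[3, 7, 2, 9, 4]])            # e.g. labels == input_ids

    shift_logits = lm_logits[..., :-1, :].contiguous()  # drop the last position
    shift_labels = labels[..., 1:].contiguous()         # drop the first token

    loss_fct = CrossEntropyLoss(ignore_index=-1)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    print(loss.item())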
@@ -414,11 +414,11 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
 Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
 **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
 Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
 Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@@ -536,7 +536,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 r"""
-**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
 Labels for language modeling.
 Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
 Indices are selected in ``[-1, 0, ..., config.vocab_size]``
@@ -544,7 +544,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 computed for labels in ``[0, ..., config.vocab_size]``
 Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
 Language modeling loss.
 **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
 Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -562,7 +562,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
 >>> model = OpenAIGPTLMHeadModel(config)
 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
->>> outputs = model(input_ids, lm_labels=input_ids)
+>>> outputs = model(input_ids, labels=input_ids)
 >>> loss, logits = outputs[:2]
 """
@@ -581,16 +581,16 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 self._tie_or_clone_weights(self.lm_head,
 self.transformer.tokens_embed)
-def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
+def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
 transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
 hidden_states = transformer_outputs[0]
 lm_logits = self.lm_head(hidden_states)
 outputs = (lm_logits,) + transformer_outputs[1:]
-if lm_labels is not None:
+if labels is not None:
 # Shift so that tokens < n predict n
 shift_logits = lm_logits[..., :-1, :].contiguous()
-shift_labels = lm_labels[..., 1:].contiguous()
+shift_labels = labels[..., 1:].contiguous()
 # Flatten the tokens
 loss_fct = CrossEntropyLoss(ignore_index=-1)
 loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
@@ -625,11 +625,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
 **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
 Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
 Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
 Labels for language modeling.
......
@@ -937,13 +937,13 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
 Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
 See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
 :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-**mems**:
+**mems**: (`optional`)
 list of ``torch.FloatTensor`` (one for each layer):
 that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
 (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
 **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
 Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
 ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@@ -954,7 +954,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
 **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
 Sequence of hidden-states at the last layer of the model.
-**mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+**mems**:
 list of ``torch.FloatTensor`` (one for each layer):
 that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
 (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
@@ -1270,7 +1270,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 **prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
 Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
 We don't output them when the loss is computed to speedup adaptive softmax decoding.
-**mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+**mems**:
 list of ``torch.FloatTensor`` (one for each layer):
 that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
 (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
......
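For the `mems` input documented above, the following sketch shows how cached states can be carried across two segments of a long text (it assumes `pytorch_transformers` and the `transfo-xl-wt103` checkpoint, and that `mems` is accepted as a keyword argument of `forward`):

    import torch
    from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel

    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model = TransfoXLModel.from_pretrained('transfo-xl-wt103')

    # Two consecutive segments of the same text; `mems` lets the second segment
    # attend to the hidden states computed for the first one.
    segment_1 = torch.tensor(tokenizer.encode("Hello , my dog is cute")).unsqueeze(0)
    segment_2 = torch.tensor(tokenizer.encode("and my cat is too")).unsqueeze(0)

    outputs_1 = model(segment_1)
    hidden_1, mems = outputs_1[:2]           # mems: one FloatTensor per layer

    outputs_2 = model(segment_2, mems=mems)  # reuse the cached key/value states
    hidden_2, mems = outputs_2[:2]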
@@ -538,7 +538,6 @@ class PoolerAnswerClass(nn.Module):
 class SQuADHead(nn.Module):
 """ A SQuAD head inspired by XLNet.
-Compute
 """
 def __init__(self, config):
 super(SQuADHead, self).__init__()
......
This diff is collapsed.
This diff is collapsed.