"vscode:/vscode.git/clone" did not exist on "a9f1fc6c94e6d3c489d54dadbab60c612e1d7fe2"
Commit 8fe2c9d9 authored by LysandreJik

Refactored Docstrings of BERT, GPT2, GPT, TransfoXL, XLM and XLNet.

parent ed6c8d37
......@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
pytorch_pretrained_bert bert \
pytorch_transformers bert \
$BERT_BASE_DIR/bert_model.ckpt \
$BERT_BASE_DIR/bert_config.json \
$BERT_BASE_DIR/pytorch_model.bin
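Once converted, the dumped weights can be loaded back with plain PyTorch. The snippet below is an added illustration rather than part of the original instructions; it assumes the conversion dumped a ``BertForPreTraining``-style state dict next to the original ``bert_config.json`` (pass ``strict=False`` if the head weights differ).

.. code-block:: python

    import torch
    from pytorch_transformers import BertConfig, BertForPreTraining

    bert_base_dir = "/path/to/bert/uncased_L-12_H-768_A-12"  # same directory as $BERT_BASE_DIR above

    # Rebuild the architecture from the original BERT config, then load the converted weights.
    config = BertConfig.from_json_file(bert_base_dir + "/bert_config.json")
    model = BertForPreTraining(config)
    state_dict = torch.load(bert_base_dir + "/pytorch_model.bin", map_location="cpu")
    model.load_state_dict(state_dict, strict=False)
    model.eval()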
......@@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
pytorch_pretrained_bert gpt \
pytorch_transformers gpt \
$OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
$PYTORCH_DUMP_OUTPUT \
[OPENAI_GPT_CONFIG]
......@@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
pytorch_pretrained_bert transfo_xl \
pytorch_transformers transfo_xl \
$TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
$PYTORCH_DUMP_OUTPUT \
[TRANSFO_XL_CONFIG]
......@@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 mo
export GPT2_DIR=/path/to/gpt2/checkpoint
pytorch_pretrained_bert gpt2 \
pytorch_transformers gpt2 \
$GPT2_DIR/model.ckpt \
$PYTORCH_DUMP_OUTPUT \
[GPT2_CONFIG]
......@@ -79,7 +79,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
pytorch_pretrained_bert xlnet \
pytorch_transformers xlnet \
$TRANSFO_XL_CHECKPOINT_PATH \
$TRANSFO_XL_CONFIG_PATH \
$PYTORCH_DUMP_OUTPUT \
......
......@@ -4,75 +4,75 @@ BERT
``BertConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertConfig
.. autoclass:: pytorch_transformers.BertConfig
:members:
``BertTokenizer``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertTokenizer
.. autoclass:: pytorch_transformers.BertTokenizer
:members:
``BertAdam``
~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertAdam
.. autoclass:: pytorch_transformers.BertAdam
:members:
1. ``BertModel``
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertModel
.. autoclass:: pytorch_transformers.BertModel
:members:
2. ``BertForPreTraining``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertForPreTraining
.. autoclass:: pytorch_transformers.BertForPreTraining
:members:
3. ``BertForMaskedLM``
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertForMaskedLM
.. autoclass:: pytorch_transformers.BertForMaskedLM
:members:
4. ``BertForNextSentencePrediction``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertForNextSentencePrediction
.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
:members:
5. ``BertForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertForSequenceClassification
.. autoclass:: pytorch_transformers.BertForSequenceClassification
:members:
6. ``BertForMultipleChoice``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertForMultipleChoice
.. autoclass:: pytorch_transformers.BertForMultipleChoice
:members:
7. ``BertForTokenClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertForTokenClassification
.. autoclass:: pytorch_transformers.BertForTokenClassification
:members:
8. ``BertForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.BertForQuestionAnswering
.. autoclass:: pytorch_transformers.BertForQuestionAnswering
:members:
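To give a sense of how the classes documented above fit together, here is a brief added sketch (the ``bert-base-uncased`` shortcut name is assumed to be available; per the ``BertForMaskedLM`` docstring, the call returns the masked language modeling logits when no labels are passed).

.. code-block:: python

    import torch
    from pytorch_transformers import BertTokenizer, BertForMaskedLM

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    # Build a masked input by hand; the special tokens are kept intact by the tokenizer.
    tokens = tokenizer.tokenize("[CLS] the cat sat on the [MASK] . [SEP]")
    masked_index = tokens.index('[MASK]')
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    with torch.no_grad():
        predictions = model(input_ids)  # [batch_size, sequence_length, vocab_size]
    predicted_id = int(predictions[0, masked_index].argmax())
    print(tokenizer.convert_ids_to_tokens([predicted_id])[0])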
......@@ -4,40 +4,40 @@ OpenAI GPT
``OpenAIGPTConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.OpenAIGPTConfig
.. autoclass:: pytorch_transformers.OpenAIGPTConfig
:members:
``OpenAIGPTTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.OpenAIGPTTokenizer
.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
:members:
``OpenAIAdam``
~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.OpenAIAdam
.. autoclass:: pytorch_transformers.OpenAIAdam
:members:
9. ``OpenAIGPTModel``
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.OpenAIGPTModel
.. autoclass:: pytorch_transformers.OpenAIGPTModel
:members:
10. ``OpenAIGPTLMHeadModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.OpenAIGPTLMHeadModel
.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
:members:
11. ``OpenAIGPTDoubleHeadsModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.OpenAIGPTDoubleHeadsModel
.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
:members:
......@@ -4,33 +4,33 @@ OpenAI GPT2
``GPT2Config``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.GPT2Config
.. autoclass:: pytorch_transformers.GPT2Config
:members:
``GPT2Tokenizer``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.GPT2Tokenizer
.. autoclass:: pytorch_transformers.GPT2Tokenizer
:members:
14. ``GPT2Model``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.GPT2Model
.. autoclass:: pytorch_transformers.GPT2Model
:members:
15. ``GPT2LMHeadModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.GPT2LMHeadModel
.. autoclass:: pytorch_transformers.GPT2LMHeadModel
:members:
16. ``GPT2DoubleHeadsModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.GPT2DoubleHeadsModel
.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
:members:
......@@ -5,26 +5,26 @@ Transformer XL
``TransfoXLConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.TransfoXLConfig
.. autoclass:: pytorch_transformers.TransfoXLConfig
:members:
``TransfoXLTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.TransfoXLTokenizer
.. autoclass:: pytorch_transformers.TransfoXLTokenizer
:members:
12. ``TransfoXLModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.TransfoXLModel
.. autoclass:: pytorch_transformers.TransfoXLModel
:members:
13. ``TransfoXLLMHeadModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_pretrained_bert.TransfoXLLMHeadModel
.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
:members:
XLM
----------------------------------------------------
``XLMConfig``
~~~~~~~~~~~~~~~~~~~~~
I don't really know what to put here, I'll leave it up to you to decide @Thom
\ No newline at end of file
.. autoclass:: pytorch_transformers.XLMConfig
:members:
17. ``XLMModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_transformers.XLMModel
:members:
18. ``XLMWithLMHeadModel``
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
:members:
19. ``XLMForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_transformers.XLMForSequenceClassification
:members:
20. ``XLMForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
:members:
......@@ -11,7 +11,7 @@ First let's prepare a tokenized input with ``BertTokenizer``
.. code-block:: python
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
......@@ -89,7 +89,7 @@ First let's prepare a tokenized input with ``OpenAIGPTTokenizer``
.. code-block:: python
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
......@@ -177,7 +177,7 @@ First let's prepare a tokenized input with ``TransfoXLTokenizer``
.. code-block:: python
import torch
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
......@@ -253,7 +253,7 @@ First let's prepare a tokenized input with ``GPT2Tokenizer``
.. code-block:: python
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from pytorch_transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
......
......@@ -150,7 +150,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
class BertConfig(PretrainedConfig):
r"""
:class:`~pytorch_pretrained_bert.BertConfig` is the configuration class to store the configuration of a
:class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
`BertModel`.
Arguments:
......@@ -193,6 +193,29 @@ class BertConfig(PretrainedConfig):
layer_norm_eps=1e-12,
**kwargs):
"""Constructs BertConfig.
Arguments:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
"""
super(BertConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
......@@ -219,6 +242,7 @@ class BertConfig(PretrainedConfig):
"or the path to a pretrained model config file (str)")
try:
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
except ImportError:
......@@ -568,7 +592,7 @@ class BertPreTrainedModel(PreTrainedModel):
class BertModel(BertPreTrainedModel):
r"""BERT model ("Bidirectional Embedding Representations from a Transformer").
:class:`~pytorch_pretrained_bert.BertModel` is the basic BERT Transformer model with a layer of summed token, \
:class:`~pytorch_transformers.BertModel` is the basic BERT Transformer model with a layer of summed token, \
position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 \
for BERT-large). The model is instantiated with the following parameters.
......@@ -605,23 +629,23 @@ class BertModel(BertPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, head_mask=None):
"""
Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Arguments:
input_ids: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the \
input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length] with the word token indices in the \
vocabulary(see the tokens pre-processing logic in the scripts `run_bert_extract_features.py`, \
`run_bert_classifier.py` and `run_bert_squad.py`)
token_type_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \
token_type_ids: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token \
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \
a `sentence B` token (see BERT paper for more details).
attention_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices \
attention_mask: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices \
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \
input sequence length in the current batch. It's the mask that we typically use for attention when \
a batch has varying length sentences.
output_all_encoded_layers: boolean which controls the content of the `encoded_layers` output as described \
below. Default: `True`.
head_mask: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 \
head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 \
and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 \
=> head is not masked.
......@@ -633,12 +657,12 @@ class BertModel(BertPreTrainedModel):
If ``output_all_encoded_layers`` is set to True, outputs a list of the full sequences of \
encoded-hidden-states at the end of each attention \
block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a\
torch.FloatTensor of size [batch_size, sequence_length, hidden_size].
``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size].
If set to False, outputs only the full sequence of hidden-states corresponding \
to the last attention block of shape [batch_size, sequence_length, hidden_size].
``pooled_output`` is a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a \
``pooled_output`` is a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a \
classifier pretrained on top of the hidden state associated to the first character of the \
input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
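A compact usage sketch matching the Returns description above (added for illustration; the ``bert-base-uncased`` shortcut name and the hand-built input are assumptions)::

    import torch
    from pytorch_transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    tokens = tokenizer.tokenize("[CLS] hello world [SEP]")
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    with torch.no_grad():
        # As described above: the encoded hidden states and the pooled output
        # computed from the first ([CLS]) token.
        encoded_layers, pooled_output = model(input_ids)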
......@@ -731,38 +755,40 @@ class BertForPreTraining(BertPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
next_sentence_label=None, head_mask=None):
"""
Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Args:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
`masked_lm_labels`: optional masked language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
`next_sentence_label`: optional next sentence classification loss: ``torch.LongTensor`` of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
Either a torch.Tensor or tuple(torch.Tensor, torch.Tensor).
Either a ``torch.Tensor`` or ``tuple(torch.Tensor, torch.Tensor)``.
if ``masked_lm_labels`` and ``next_sentence_label`` are not ``None``, outputs the total_loss which is the \
sum of the masked language modeling loss and the next \
sentence classification loss.
if ``masked_lm_labels`` or ``next_sentence_label` is `None``, outputs a tuple comprising:
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
if ``masked_lm_labels`` or ``next_sentence_label`` is ``None``, outputs a tuple made of:
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]
- the next sentence classification logits of shape [batch_size, 2].
Example ::
......@@ -823,31 +849,31 @@ class BertForMaskedLM(BertPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
"""
Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Args:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
`masked_lm_labels`: masked language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`head_mask`: an optional torch.LongTensor of shape [num_heads] with indices
`head_mask`: an optional ``torch.LongTensor`` of shape [num_heads] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
Masked language modeling loss if `masked_lm_labels` is specified, masked language modeling
Masked language modeling loss if ``masked_lm_labels`` is specified, masked language modeling
logits of shape [batch_size, sequence_length, vocab_size] otherwise.
Example::
......@@ -901,30 +927,30 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
"""
Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Args:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
`next_sentence_label`: next sentence classification loss: ``torch.LongTensor`` of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between
0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked,
0.0 => head is not masked.
Returns:
If `next_sentence_label` is specified, outputs the total_loss which is the sum of the masked language \
modeling loss and the next sentence classification loss.
if `next_sentence_label` is `None`, outputs the next sentence classification logits of shape [batch_size, 2].
If ``next_sentence_label`` is specified, outputs the total_loss which is the sum of the masked language
modeling loss and the next sentence classification loss. If ``next_sentence_label`` is ``None``, outputs
the next sentence classification logits of shape [batch_size, 2].
Example::
......@@ -984,27 +1010,27 @@ class BertForSequenceClassification(BertPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
"""
Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Parameters:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
`labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size]
with indices selected in [0, ..., num_labels].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`, outputs the classification logits of shape `[batch_size, num_labels]`.
If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, num_labels].
Example::
......@@ -1070,27 +1096,27 @@ class BertForMultipleChoice(BertPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
"""
Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Parameters:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
`labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size]
with indices selected in [0, ..., num_choices].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`, outputs the classification logits of shape [batch_size, num_labels].
If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, num_labels].
Example::
......@@ -1159,27 +1185,27 @@ class BertForTokenClassification(BertPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
"""
Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Parameters:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
`labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`, outputs the classification logits of shape [batch_size, sequence_length, num_labels].
If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example::
......@@ -1243,6 +1269,8 @@ class BertForQuestionAnswering(BertPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
end_positions=None, head_mask=None):
"""
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Parameters:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
......@@ -1260,13 +1288,13 @@ class BertForQuestionAnswering(BertPreTrainedModel):
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
if `start_positions` and `end_positions` are not `None`, outputs the total_loss which is the sum of the
If ``start_positions`` and ``end_positions`` are not ``None``, outputs the total_loss which is the sum of the
CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`, outputs a tuple of start_logits, end_logits which are the
If ``start_positions`` or ``end_positions`` is ``None``, outputs a tuple of start_logits, end_logits which are the
logits respectively for the start and end position tokens of shape [batch_size, sequence_length].
Example::
......
......@@ -101,6 +101,25 @@ def gelu(x):
class GPT2Config(PretrainedConfig):
"""Configuration class to store the configuration of a `GPT2Model`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `GPT2Model` or a configuration json file.
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
predict_special_tokens: whether to predict special tokens (when the model has an LM head)
"""
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
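As a quick illustration of the fields above, a configuration can be instantiated directly; the numbers in this added sketch are GPT-2 small-style settings given as assumptions::

    from pytorch_transformers import GPT2Config

    # Each keyword matches an argument documented in the docstring above.
    config = GPT2Config(
        vocab_size_or_config_json_file=50257,
        n_positions=1024,
        n_ctx=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
    )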
......@@ -418,9 +437,11 @@ class GPT2Model(GPT2PreTrainedModel):
GPT-2 use a single embedding matrix to store the word and special embeddings.
Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
The embeddings are ordered as follows in the token embeddings matrix:
::
The embeddings are ordered as follow in the token embeddings matrice:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
......@@ -428,47 +449,24 @@ class GPT2Model(GPT2PreTrainedModel):
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
::
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associate indices to index the embeddings.
Params:
You should use the associated indices to index the embeddings.
Args:
`config`: a GPT2Config class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs a tuple consisting of:
`hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
(or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
`presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
torch.FloatTensors. They can be reused to speed up sequential decoding.
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2Model(config)
hidden_states, presents = model(input_ids)
```
Example::
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2Model(config)
"""
def __init__(self, config):
......@@ -485,7 +483,7 @@ class GPT2Model(GPT2PreTrainedModel):
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens=None):
" Update input embeddings with new embedding matrice if needed "
"""Update input embeddings with new embedding matrix if needed."""
if num_special_tokens is None or self.config.n_special == num_special_tokens:
return
# Update config
......@@ -506,6 +504,47 @@ class GPT2Model(GPT2PreTrainedModel):
self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
"""
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
A tuple consisting of ``hidden_states`` and ``presents``.
``hidden_states`` is a list of all the encoded-hidden-states in the model (length of the list: number of
layers + 1 for the output of the embeddings) as ``torch.FloatTensor`` of size [batch_size, sequence_length,
hidden_size] (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of
input_ids).
``presents`` is a list of pre-computed hidden-states (key and values in each attention block) as
torch.FloatTensors. They can be reused to speed up sequential decoding.
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
hidden_states, presents = model(input_ids)
# or
hidden_states, presents = model.forward(input_ids)
"""
if past is None:
past_length = 0
past = [None] * len(self.h)
......@@ -580,50 +619,18 @@ class GPT2Model(GPT2PreTrainedModel):
class GPT2LMHeadModel(GPT2PreTrainedModel):
"""OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").
Params:
Args:
`config`: a GPT2Config class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` is not `None`:
Outputs the language modeling loss.
else a tuple:
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size]
(or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ... d_n are the dimension of input_ids)
`presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
torch.FloatTensors. They can be reused to speed up sequential decoding.
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_gpt2.GPT2Config()
Example::
model = modeling_gpt2.GPT2LMHeadModel(config)
lm_logits, presents = model(input_ids)
```
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2LMHeadModel(config)
"""
def __init__(self, config):
......@@ -633,14 +640,58 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
TODO Shouldn't we put args + returns ?
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
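For instance (an added, illustrative sketch; the token list is arbitrary)::

    # Give the model room for three task-specific special tokens; input and output
    # embeddings stay tied through set_embeddings_weights above.
    special_tokens = ['<bos>', '<speaker1>', '<speaker2>']
    model.set_num_special_tokens(len(special_tokens))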
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
"""
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
If ``lm_labels`` is not ``None``, returns the language modeling loss. If ``lm_labels`` is ``None``, returns
a tuple of (``lm_logits``, ``presents``).
``lm_logits`` is the language modeling logits as a ``torch.FloatTensor`` of size [batch_size,
sequence_length, config.vocab_size] (or more generally [d_1, ..., d_n, config.vocab_size] where d_1 ...
d_n are the dimensions of input_ids).
``presents`` is a list of pre-computed hidden-states (key and values in each attention blocks) as
torch.FloatTensors. They can be reused to speed up sequential decoding.
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
lm_logits, presents = model(input_ids)
# or
lm_logits, presents = model.forward(input_ids)
"""
transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
hidden_states = transformer_outputs[0]
......@@ -663,55 +714,16 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
"""OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").
Params:
Args:
`config`: a GPT2Config class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
indices selected in the range [0, config.vocab_size[
`mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., config.vocab_size]
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` and `multiple_choice_labels` are not `None`:
Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
else: a tuple with
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size]
`multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
`presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
torch.FloatTensors. They can be reused to speed up sequential decoding.
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length)
mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
Example::
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2DoubleHeadsModel(config)
lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
```
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2DoubleHeadsModel(config)
"""
def __init__(self, config):
......@@ -723,8 +735,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
TODO Shouldn't we put args + returns ?
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
......@@ -732,6 +745,55 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
position_ids=None, past=None, head_mask=None):
"""
Performs a model forward pass. **The forward pass can be run by calling the instantiated model directly.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
indices selected in the range [0, config.vocab_size[
`mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., config.vocab_size]
`multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
with indices selected in [0, ..., num_choices].
`past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
If ``lm_labels`` and ``multiple_choice_labels`` are not ``None``, outputs a
``tuple(language_modeling_loss, multiple_choice_loss)``. If they are ``None``, outputs a
``tuple(lm_logits, multiple_choice_logits, presents)``.
``lm_logits``: the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, num_choices, sequence_length, config.vocab_size]
``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of size [batch_size, num_choices]
``presents``: a list of pre-computed hidden-states (key and values in each attention blocks) as
torch.FloatTensors. They can be reused to speed up sequential decoding.
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length)
mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
# or
lm_logits, multiple_choice_logits, presents = model.forward(input_ids, mc_token_ids)
"""
transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
hidden_states = transformer_outputs[0]
......
......@@ -127,7 +127,29 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
class OpenAIGPTConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `OpenAIGPTModel`.
"""
Configuration class to store the configuration of a `OpenAIGPTModel`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `OpenAIGPTModel` or a configuration json file.
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
afn: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
resid_pdrop: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
layer_norm_epsilon: epsilon to use in the layer norm layers
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
predict_special_tokens: whether to predict special tokens (when the model has an LM head)
"""
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
......@@ -157,27 +179,6 @@ class OpenAIGPTConfig(PretrainedConfig):
**kwargs
):
"""Constructs OpenAIGPTConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
afn: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
layer_norm_epsilon: epsilon to use in the layer norm layers
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
predict_special_tokens: whether to predict special tokens (when the model has an LM head)
"""
super(OpenAIGPTConfig, self).__init__(**kwargs)
......@@ -441,12 +442,16 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
"""OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional tokens that are not pre-trained, such as: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
The number of special embeddings can be controlled using the ``set_num_special_tokens(num_special_tokens)`` function.
The embeddings are ordered as follows in the token embeddings matrix:
::
The embeddings are ordered as follows in the token embeddings matrix:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
......@@ -454,44 +459,25 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
::
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associated indices to index the embeddings.
Params:
You should use the associated indices to index the embeddings.
Args:
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
`hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
(or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTModel(config)
hidden_states = model(input_ids)
```
Example::
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTModel(config)
"""
def __init__(self, config):
......@@ -507,7 +493,17 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens=None):
" Update input embeddings with new embedding matrice if needed "
"""
Update input embeddings with new embedding matrice if needed
TODO
Args:
num_special_tokens:
Returns:
"""
if num_special_tokens is None or self.config.n_special == num_special_tokens:
return
# Update config
......@@ -528,6 +524,37 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.h[layer].attn.prune_heads(heads)
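A hedged usage sketch for the embedding-resizing utility documented above; the effect on ``config.n_special`` is inferred from the ``# Update config`` step in the method body and the import path is assumed.

```python
# Hedged sketch: allocate two fine-tuning-only special tokens before training.
from pytorch_transformers.modeling_openai import OpenAIGPTConfig, OpenAIGPTModel  # assumed import path

model = OpenAIGPTModel(OpenAIGPTConfig())
model.set_num_special_tokens(2)       # no-op if config.n_special is already 2, per the early return above
assert model.config.n_special == 2    # assumption: the config is updated alongside the embeddings
```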
def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
"""
Performs a model forward pass. **Once the model has been instantiated, calling it directly is equivalent to calling this method.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
``hidden_states``, a list of all the encoded-hidden-states in the model (length of the list is number
of layers + 1 for the output of the embeddings)
as ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size]
(or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
hidden_states = model(input_ids)
# or
hidden_states = model.forward(input_ids)
"""
if position_ids is None:
# This was used when we had a single embedding matrix from position and token embeddings
# start = self.config.vocab_size + self.config.n_special
......@@ -594,10 +621,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
Special tokens need to be trained during the fine-tuning if you use them. The number of special embeddings
can be controlled using the ``set_num_special_tokens(num_special_tokens)`` function.
The embeddings are ordered as follows in the token embeddings matrix:
::
The embeddings are ordered as follows in the token embeddings matrix:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
......@@ -605,49 +635,25 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
::
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associated indices to index the embeddings.
Params:
You should use the associated indices to index the embeddings.
Args:
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` is not `None`:
Outputs the language modeling loss.
else:
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings]
(or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are the dimensions of input_ids)
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTLMHeadModel(config)
lm_logits = model(input_ids)
```
Example::
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTLMHeadModel(config)
"""
def __init__(self, config):
......@@ -657,14 +663,50 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
TODO
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
"""
Performs a model forward pass. **Once the model has been instantiated, calling it directly is equivalent to calling this method.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
if ``lm_labels`` is not ``None``, outputs the language modeling loss. Otherwise, outputs ``lm_logits``,
the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, sequence_length,
total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are
the dimensions of input_ids)
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
lm_logits = model(input_ids)
# or
lm_logits = model.forward(input_ids)
"""
transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......@@ -689,9 +731,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
The number of special embeddings can be controlled using the ``set_num_special_tokens(num_special_tokens)``
function.
The embeddings are ordered as follows in the token embeddings matrix:
::
The embeddings are ordered as follows in the token embeddings matrix:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
......@@ -699,54 +745,24 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
::
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associated indices to index the embeddings.
Params:
Args:
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
indices selected in the range [0, total_tokens_embeddings[
`mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., total_tokens_embeddings]
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` and `multiple_choice_labels` are not `None`:
Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
else: a tuple with
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
`multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length)
mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
```
Example::
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
"""
def __init__(self, config):
......@@ -761,6 +777,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
TODO
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
......@@ -768,6 +785,50 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
position_ids=None, head_mask=None):
"""
Performs a model forward pass. **Once the model has been instantiated, calling it directly is equivalent to calling this method.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
indices selected in the range [0, total_tokens_embeddings[
`mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., total_tokens_embeddings]
`mc_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
with indices selected in [0, ..., num_choices].
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
if ``lm_labels`` and ``mc_labels`` are not ``None``, outputs a tuple of losses with the
language modeling loss and the multiple choice loss. Otherwise, returns a
``tuple(lm_logits, multiple_choice_logits)``.
``lm_logits`` are the language modeling logits as a ``torch.FloatTensor`` of size
[batch_size, num_choices, sequence_length, total_tokens_embeddings]
``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of
size [batch_size, num_choices]
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length)
mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
# or
lm_logits, multiple_choice_logits = model.forward(input_ids, mc_token_ids)
"""
transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
hidden_states = transformer_outputs[0]
......
......@@ -177,6 +177,38 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
class TransfoXLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `TransfoXLModel`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
cutoffs: cutoffs for the adaptive softmax
d_model: Dimensionality of the model's hidden states.
d_embed: Dimensionality of the embeddings
d_head: Dimensionality of the model's heads.
div_val: divisor value for the adaptive input and softmax
pre_lnorm: apply LayerNorm to the input instead of the output
d_inner: Inner dimension in FF
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
tgt_len: number of tokens to predict
ext_len: length of the extended context
mem_len: length of the retained previous hidden states (the memory)
same_length: use the same attn length for all tokens
proj_share_all_but_first: True to share all but first projs, False not to share.
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
clamp_len: use the same pos embeddings after clamp_len
sample_softmax: number of samples in sampled softmax
adaptive: use adaptive softmax
tie_weight: tie the word embedding and softmax weights
dropout: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
dropatt: The dropout ratio for the attention probabilities.
untie_r: untie relative position biases
embd_pdrop: The dropout ratio for the embeddings.
init: parameter initializer to use
init_range: parameters initialized by U(-init_range, init_range).
proj_init_std: parameters initialized by N(0, proj_init_std)
init_std: parameters initialized by N(0, init_std)
"""
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
......@@ -210,38 +242,6 @@ class TransfoXLConfig(PretrainedConfig):
init_std=0.02,
**kwargs):
"""Constructs TransfoXLConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
cutoffs: cutoffs for the adaptive softmax
d_model: Dimensionality of the model's hidden states.
d_embed: Dimensionality of the embeddings
d_head: Dimensionality of the model's heads.
div_val: divisor value for the adaptive input and softmax
pre_lnorm: apply LayerNorm to the input instead of the output
d_inner: Inner dimension in FF
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
tgt_len: number of tokens to predict
ext_len: length of the extended context
mem_len: length of the retained previous hidden states (the memory)
same_length: use the same attn length for all tokens
proj_share_all_but_first: True to share all but first projs, False not to share.
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
clamp_len: use the same pos embeddings after clamp_len
sample_softmax: number of samples in sampled softmax
adaptive: use adaptive softmax
tie_weight: tie the word embedding and softmax weights
dropout: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
dropatt: The dropout ratio for the attention probabilities.
untie_r: untie relative position biases
embd_pdrop: The dropout ratio for the embeddings.
init: parameter initializer to use
init_range: parameters initialized by U(-init_range, init_range).
proj_init_std: parameters initialized by N(0, proj_init_std)
init_std: parameters initialized by N(0, init_std)
"""
super(TransfoXLConfig, self).__init__(**kwargs)
......@@ -901,42 +901,20 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
class TransfoXLModel(TransfoXLPreTrainedModel):
"""Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
- you don't need to specify positioning embeddings indices
- the tokens in the vocabulary have to be sorted in decreasing frequency.
Transformer XL uses relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
- you don't need to specify positioning embeddings indices.
- the tokens in the vocabulary have to be sorted in decreasing frequency.
Params:
Args:
config: a TransfoXLConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`mems`: optional memory of hidden states from previous forward passes
as a list (num layers) of hidden states at the entry of each layer
each hidden state has shape [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Outputs:
A tuple of (last_hidden_state, new_mems)
`last_hidden_state`: the encoded-hidden-states at the top of the model
as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
`new_mems`: list (num layers) of updated mem states at the entry of each layer
each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
config = TransfoXLConfig()
model = TransfoXLModel(config)
last_hidden_state, new_mems = model(input_ids)
# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, new_mems)
```
Example::
config = TransfoXLConfig()
model = TransfoXLModel(config)
"""
def __init__(self, config):
super(TransfoXLModel, self).__init__(config)
......@@ -1200,18 +1178,40 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
def forward(self, input_ids, mems=None, head_mask=None):
""" Params:
input_ids :: [bsz, len]
mems :: optional mems from previous forward passes (or init_mems)
list (num layers) of mem states at the entry of each layer
shape :: [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Returns:
tuple (last_hidden, new_mems) where:
new_mems: list (num layers) of mem states at the entry of each layer
shape :: [self.config.mem_len, bsz, self.config.d_model]
last_hidden: output of the last layer:
shape :: [bsz, len, self.config.d_model]
"""
Performs a model forward pass. **Once the model has been instantiated, calling it directly is equivalent to calling this method.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`mems`: optional memory of hidden states from previous forward passes
as a list (num layers) of hidden states at the entry of each layer
each hidden state has shape [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Returns:
A tuple of ``(last_hidden_state, new_mems)``.
``last_hidden_state``: the encoded-hidden-states at the top of the model
as a ``torch.FloatTensor`` of size [batch_size, sequence_length, self.config.d_model]
``new_mems``: list (num layers) of updated mem states at the entry of each layer
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
``labels``
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
last_hidden_state, new_mems = model(input_ids)
# or
last_hidden_state, new_mems = model.forward(input_ids)
# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, new_mems)
"""
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
......@@ -1227,52 +1227,24 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
"""Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
This model adds an (adaptive) softmax head on top of the TransfoXLModel
This model adds an (adaptive) softmax head on top of the ``TransfoXLModel``
Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
- you don't need to specify positioning embeddings indices
- the tokens in the vocabulary have to be sorted in decreasing frequency.
Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
Call self.tie_weights() if you update/load the weights of the transformer to keep the weights tied.
- you don't need to specify positioning embeddings indices
Params:
config: a TransfoXLConfig class instance with the configuration to build a new model
- the tokens in the vocabulary have to be sorted in decreasing frequency.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`labels`: an optional torch.LongTensor of shape [batch_size, sequence_length]
with the labels token indices selected in the range [0, self.config.n_token[
`mems`: an optional memory of hidden states from previous forward passes
as a list (num layers) of hidden states at the entry of each layer
each hidden state has shape [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Outputs:
A tuple of (last_hidden_state, new_mems)
`softmax_output`: output of the (adaptive) softmax:
if labels is not None:
Negative log likelihood of shape [batch_size, sequence_length]
else:
log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
`new_mems`: list (num layers) of updated mem states at the entry of each layer
each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Call ``self.tie_weights()`` if you update/load the weights of the transformer to keep the weights tied.
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
config = TransfoXLConfig()
model = TransfoXLModel(config)
last_hidden_state, new_mems = model(input_ids)
# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
```
Args:
config: a ``TransfoXLConfig`` class instance with the configuration to build a new model
Example::
config = TransfoXLConfig()
model = TransfoXLModel(config)
"""
def __init__(self, config):
super(TransfoXLLMHeadModel, self).__init__(config)
......@@ -1290,7 +1262,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
self.tie_weights()
def tie_weights(self):
""" Run this to be sure output and input (adaptive) softmax weights are tied """
"""
Run this to be sure output and input (adaptive) softmax weights are tied
"""
# sampled softmax
if self.sample_softmax > 0:
if self.config.tie_weight:
......@@ -1314,18 +1288,43 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(data)
def forward(self, input_ids, labels=None, mems=None, head_mask=None):
""" Params:
input_ids :: [bsz, len]
labels :: [bsz, len]
Returns:
tuple(softmax_output, new_mems) where:
new_mems: list (num layers) of hidden states at the entry of each layer
shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here w. regards to input_ids
softmax_output: output of the (adaptive) softmax:
if labels is not None:
Negative log likelihood of shape :: [bsz, len]
else:
log probabilities of tokens, shape :: [bsz, len, n_tokens]
"""
Performs a model forward pass. **Once the model has been instantiated, calling it directly is equivalent to calling this method.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`labels`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the labels token indices selected in the range [0, self.config.n_token[
`mems`: an optional memory of hidden states from previous forward passes
as a list (num layers) of hidden states at the entry of each layer
each hidden state has shape [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Returns:
A tuple of (last_hidden_state, new_mems)
``last_hidden_state``: output of the (adaptive) softmax. If ``labels`` is not ``None``, it is the negative
log likelihood of shape [batch_size, sequence_length]. Otherwise, it is the log probabilities of the
tokens, of shape [batch_size, sequence_length, n_tokens].
``new_mems``: list (num layers) of updated mem states at the entry of each layer
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
``labels``
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
last_hidden_state, new_mems = model(input_ids)
# or
last_hidden_state, new_mems = model.forward(input_ids)
# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
"""
bsz = input_ids.size(0)
tgt_len = input_ids.size(1)
......
......@@ -45,6 +45,46 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLMConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `XLMModel`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
d_model: Size of the encoder layers and the pooler layer.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
d_inner: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r: untie relative position biases
attn_type: 'bi' for XLM, 'uni' for Transformer-XL
dropout: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
dropatt: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
dropout: float, dropout rate.
dropatt: float, dropout rate on attention probabilities.
init: str, the initialization scheme, either "normal" or "uniform".
init_range: float, initialize the parameters with a uniform distribution
in [-init_range, init_range]. Only effective when init="uniform".
init_std: float, initialize the parameters with a normal distribution
with mean 0 and stddev init_std. Only effective when init="normal".
mem_len: int, the number of tokens to cache.
reuse_len: int, the number of tokens in the current batch to be cached
and reused in the future.
bi_data: bool, whether to use bidirectional input pipeline.
Usually set to True during pretraining and False during finetuning.
clamp_len: int, clamp all relative distances larger than clamp_len.
-1 means no clamping.
same_length: bool, whether to use the same attention length for each token.
"""
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
......@@ -83,46 +123,6 @@ class XLMConfig(PretrainedConfig):
end_n_top=5,
**kwargs):
"""Constructs XLMConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
d_model: Size of the encoder layers and the pooler layer.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
d_inner: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r: untie relative position biases
attn_type: 'bi' for XLM, 'uni' for Transformer-XL
dropout: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
dropatt: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
dropout: float, dropout rate.
dropatt: float, dropout rate on attention probabilities.
init: str, the initialization scheme, either "normal" or "uniform".
init_range: float, initialize the parameters with a uniform distribution
in [-init_range, init_range]. Only effective when init="uniform".
init_std: float, initialize the parameters with a normal distribution
with mean 0 and stddev init_std. Only effective when init="normal".
mem_len: int, the number of tokens to cache.
reuse_len: int, the number of tokens in the current batch to be cached
and reused in the future.
bi_data: bool, whether to use bidirectional input pipeline.
Usually set to True during pretraining and False during finetuning.
clamp_len: int, clamp all relative distances larger than clamp_len.
-1 means no clamping.
same_length: bool, whether to use the same attention length for each token.
"""
super(XLMConfig, self).__init__(**kwargs)
......@@ -377,64 +377,33 @@ class XLMPreTrainedModel(PreTrainedModel):
class XLMModel(XLMPreTrainedModel):
"""
XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads',
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
'asm_cutoffs', 'asm_div_value']
def __init__(self, config): #, dico, is_encoder, with_output):
""" XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
Paper: https://arxiv.org/abs/1901.07291
Original code: https://github.com/facebookresearch/XLM
Params:
`config`: a XLMConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLM paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Paper: https://arxiv.org/abs/1901.07291
Original code: https://github.com/facebookresearch/XLM
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated to the first character of the
input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
Args:
`config`: a XLMConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
Example::
config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.XLMModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
"""
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads',
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
'asm_cutoffs', 'asm_div_value']
def __init__(self, config): #, dico, is_encoder, with_output):
super(XLMModel, self).__init__(config)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
......@@ -507,12 +476,53 @@ class XLMModel(XLMPreTrainedModel):
def forward(self, input_ids, lengths=None, positions=None, langs=None,
token_type_ids=None, attention_mask=None, cache=None, head_mask=None): # src_enc=None, src_len=None,
"""
Inputs:
`input_ids` LongTensor(bs, slen), containing word indices
`lengths` LongTensor(bs), containing the length of each sentence
`positions` LongTensor(bs, slen), containing word positions
`langs` LongTensor(bs, slen), containing language IDs
`token_type_ids` LongTensor (bs, slen) same as `langs` used for compatibility
Performs a model forward pass. **Once the model has been instantiated, calling it directly is equivalent to calling this method.**
Parameters:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`lengths`: ``torch.LongTensor`` of size ``bs``, containing the length of each sentence
`positions`: ``torch.LongTensor`` of size ``(bs, slen)``, containing word positions
`langs`: ``torch.LongTensor`` of size ``(bs, slen)``, containing language IDs
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLM paper for more details).
`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`cache`: TODO
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
A ``tuple(encoded_layers, pooled_output)``, with
``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
- ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states at the end \
of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each \
encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size],
- ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
to the last attention block of shape [batch_size, sequence_length, hidden_size],
``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a
classifier pre-trained on top of the hidden state associated to the first character of the
input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
# or
all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
"""
if lengths is None:
lengths = (input_ids != self.pad_index).sum(dim=1).long()
......@@ -674,55 +684,23 @@ class XLMPredLayer(nn.Module):
class XLMWithLMHeadModel(XLMPreTrainedModel):
""" XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
Paper: https://arxiv.org/abs/1901.07291
Original code: https://github.com/facebookresearch/XLM
Params:
Paper: https://arxiv.org/abs/1901.07291
Original code: https://github.com/facebookresearch/XLM
Args:
`config`: a XLMConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLM paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated to the first character of the
input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.XLMModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
Example::
config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.XLMModel(config=config)
"""
def __init__(self, config):
super(XLMWithLMHeadModel, self).__init__(config)
......@@ -746,29 +724,51 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
attention_mask=None, cache=None, labels=None, head_mask=None):
"""
Args:
inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
input_mask: float32 Tensor in shape [bsz, len], the input mask.
0 for real tokens and 1 for padding.
mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
from previous batches. The length of the list equals n_layer.
If None, no memory is used.
perm_mask: float32 Tensor in shape [bsz, len, len].
If perm_mask[k, i, j] = 0, i attend to j in batch k;
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
If None, each position attends to all the others.
target_mapping: float32 Tensor in shape [bsz, num_predict, len].
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
on the j-th token.
Only used during pretraining for partial prediction.
Set to None during finetuning.
inp_q: float32 Tensor in shape [bsz, len].
1 for tokens with losses and 0 for tokens without losses.
Only used during pretraining for two-stream attention.
Set to None during finetuning.
summary_type: str, "last", "first", "mean", or "attn". The method
to pool the input to get a vector representation.
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`lengths`: TODO
`positions`: TODO
`langs`: TODO
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLM paper for more details).
`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`cache`: TODO
`labels`: TODO
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
A ``tuple(encoded_layers, pooled_output)``, with
``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
If ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states \
at the end of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each \
encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size],
If ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
to the last attention block of shape [batch_size, sequence_length, hidden_size],
``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a \
classifier pre-trained on top of the hidden state associated to the first character of the \
input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
# or
all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
"""
transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
......@@ -783,7 +783,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
class XLMForSequenceClassification(XLMPreTrainedModel):
"""XLM model ("XLM: Generalized Autoregressive Pretraining for Language Understanding").
Params:
Args:
`config`: a XLMConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
......@@ -791,58 +791,15 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
`summary_type`: str, "last", "first", "mean", or "attn". The method
to pool the input to get a vector representation. Default: last
Inputs:
inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
input_mask: float32 Tensor in shape [bsz, len], the input mask.
0 for real tokens and 1 for padding.
attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the XLM model (which uses this negative masking).
You can only use one among `input_mask` and `attention_mask`
mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
from previous batches. The length of the list equals n_layer.
If None, no memory is used.
perm_mask: float32 Tensor in shape [bsz, len, len].
If perm_mask[k, i, j] = 0, i attend to j in batch k;
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
If None, each position attends to all the others.
target_mapping: float32 Tensor in shape [bsz, num_predict, len].
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
on the j-th token.
Only used during pretraining for partial prediction.
Set to None during finetuning.
inp_q: float32 Tensor in shape [bsz, len].
1 for tokens with losses and 0 for tokens without losses.
Only used during pretraining for two-stream attention.
Set to None during finetuning.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs: Tuple of (logits or loss, mems)
`logits or loss`:
if labels is None:
Token logits with shape [batch_size, sequence_length]
else:
CrossEntropy loss with the targets
`new_mems`: list (num layers) of updated mem states at the entry of each layer
each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
n_layer=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.XLMModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
Example::
config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
n_layer=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.XLMForSequenceClassification(config=config)
"""
def __init__(self, config):
super(XLMForSequenceClassification, self).__init__(config)
......@@ -857,30 +814,36 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
attention_mask=None, cache=None, labels=None, head_mask=None):
"""
Args:
inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length] with the word token indices in the vocabulary.
lengths: an optional ``torch.LongTensor`` of shape [batch_size] with the number of non-padding tokens in each sequence,
used to build the attention mask when ``attention_mask`` is not provided.
positions: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with custom indices for the position embeddings.
langs: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the language id of each token, used by XLM's language embeddings.
token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
input_mask: float32 Tensor in shape [bsz, len], the input mask.
0 for real tokens and 1 for padding.
attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the XLM model (which uses this negative masking).
You can only use one of `input_mask` and `attention_mask`
mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
from previous batches. The length of the list equals n_layer.
If None, no memory is used.
perm_mask: float32 Tensor in shape [bsz, len, len].
If perm_mask[k, i, j] = 0, i attends to j in batch k;
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
If None, each position attends to all the others.
target_mapping: float32 Tensor in shape [bsz, num_predict, len].
If target_mapping[k, i, j] = 1, the i-th prediction in batch k is
on the j-th token.
Only used during pretraining for partial prediction.
Set to None during finetuning.
inp_q: float32 Tensor in shape [bsz, len].
1 for tokens with losses and 0 for tokens without losses.
Only used during pretraining for two-stream attention.
Set to None during finetuning.
cache: an optional dictionary of precomputed hidden states (keys and values from previous forward passes)
that can be used to speed up sequential decoding.
labels: an optional ``torch.LongTensor`` of shape [batch_size] with the classification labels;
if provided, the cross-entropy loss is returned instead of the logits.
head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer.
Returns:
A ``tuple(logits_or_loss, new_mems)``. If ``labels`` is ``None``, the first element holds the classification
logits; otherwise it holds the ``CrossEntropy`` loss computed against the targets.
``new_mems`` is a list (one entry per layer) of updated mem states at the entry of each layer; \
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]. \
Note that the first two dimensions are transposed in ``mems`` with regard to ``input_ids`` and ``labels``.
Example::
# Input sequences already converted into token ids by the tokenizer
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)
logits = outputs[0]
"""
transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
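To make the ``labels``-dependent return value above concrete, here is a small hedged sketch of using ``XLMForSequenceClassification`` both with and without labels. The tokenizer class, checkpoint name and tuple layout are assumptions about the surrounding library rather than something this diff guarantees.

```python
# Hedged sketch: assumes XLMForSequenceClassification and XLMTokenizer are importable
# from pytorch_transformers and that the forward signature matches the one above.
import torch
from pytorch_transformers import XLMTokenizer, XLMForSequenceClassification

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')

input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
labels = torch.tensor([1])  # one class id per example in the batch

# Without labels, the first returned element is assumed to be the classification logits ...
logits = model(input_ids)[0]

# ... and with labels it is assumed to be the cross-entropy loss, which can be backpropagated.
loss = model(input_ids, labels=labels)[0]
loss.backward()
```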
......@@ -904,60 +867,25 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
class XLMForQuestionAnswering(XLMPreTrainedModel):
"""XLM model for Question Answering (span extraction).
"""
XLM model for Question Answering (span extraction).
This module is composed of the XLM model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
Args:
`config`: a XLMConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the token preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLM paper for more details).
`attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the XLM model (which uses this negative masking).
You can only use one of `input_mask` and `attention_mask`
`input_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `start_positions` and `end_positions` are not `None`:
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = XLMForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
Example::
config = XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = XLMForQuestionAnswering(config)
"""
def __init__(self, config):
super(XLMForQuestionAnswering, self).__init__(config)
......@@ -971,6 +899,58 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
attention_mask=None, cache=None, start_positions=None, end_positions=None,
cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
"""
Performs a model forward pass. **Can be invoked by calling the model instance directly, once it has been instantiated.**
Args:
input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the token preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
lengths: an optional ``torch.LongTensor`` of shape [batch_size] with the number of non-padding tokens in each sequence,
used to build the attention mask when ``attention_mask`` is not provided.
positions: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with custom indices for the position embeddings.
langs: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the language id of each token, used by XLM's language embeddings.
token_type_ids: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLM paper for more details).
attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the XLM model (which uses this negative masking).
You can only use one of `input_mask` and `attention_mask`
cache: an optional dictionary of precomputed hidden states (keys and values from previous forward passes)
that can be used to speed up sequential decoding.
start_positions: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
end_positions: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
cls_index: an optional ``torch.LongTensor`` of shape [batch_size] with the position of the classification (CLS) token
in each sequence, used when predicting whether a question is answerable.
is_impossible: an optional tensor of shape [batch_size] with labels indicating whether the question has no answer
in the passage (SQuAD 2.0 style).
p_mask: an optional mask of shape [batch_size, sequence_length] with 1.0 for tokens that cannot be part of the
answer span (e.g. question tokens, padding) and 0.0 for tokens that can.
head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
Either the ``total_loss`` or a ``tuple(start_logits, end_logits)``.
If ``start_positions`` and ``end_positions`` are not ``None``, \
outputs the ``total_loss``, which is the sum of the CrossEntropy losses for the start and end token positions.
If ``start_positions`` or ``end_positions`` is ``None``, \
outputs a ``tuple(start_logits, end_logits)`` with the logits respectively for the start and end
position tokens, each of shape [batch_size, sequence_length].
Example::
# Input sequences already converted into token ids by the tokenizer
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
start_logits, end_logits = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)
# or, equivalently
start_logits, end_logits = model.forward(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)
"""
transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
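The span-extraction loss described in the Returns block above is easiest to see in a short training-style call. This is a hedged sketch: the tokenizer, model and checkpoint names are assumptions, and only the supervised path is shown because the exact layout of the inference-time output tuple depends on the question-answering head in your version.

```python
# Hedged sketch: assumes pytorch_transformers exposes XLMForQuestionAnswering with the
# start_positions / end_positions keywords from the forward signature above.
import torch
from pytorch_transformers import XLMTokenizer, XLMForQuestionAnswering

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')

text = "Who was Jim Henson? Jim Henson was a puppeteer"
input_ids = torch.tensor([tokenizer.encode(text)])

# Gold answer span; out-of-range positions would be clamped, as documented above.
start_positions = torch.tensor([5])
end_positions = torch.tensor([7])

outputs = model(input_ids,
                start_positions=start_positions,
                end_positions=end_positions)
total_loss = outputs[0]  # assumed: sum of the start- and end-position cross-entropy losses
total_loss.backward()    # differentiable, so it can drive SQuAD-style fine-tuning
```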
......
......@@ -958,10 +958,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for XLNet-base, 24 for XLNet-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, d_model],
encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, d_model],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, d_model],
`pooled_output`: a torch.FloatTensor of size [batch_size, d_model] which is the output of a
`pooled_output`: a ``torch.FloatTensor`` of size [batch_size, d_model] which is the output of a
classifier pretrained on top of the hidden state associated with the first token of the
input (`CLS`) to train on the Next-Sentence task (see XLNet's paper).
......@@ -1087,7 +1087,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
1 for tokens with losses and 0 for tokens without losses.
Only used during pretraining for two-stream attention.
Set to None during finetuning.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
......@@ -1098,7 +1098,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
else:
CrossEntropy loss with the targets
`new_mems`: list (num layers) of updated mem states at the entry of each layer
each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Example usage:
......@@ -1189,27 +1189,27 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the token preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLNet paper for more details).
`attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the BERT model (which uses this negative masking).
You can only use one of `input_mask` and `attention_mask`
`input_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
`input_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
`start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
`end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
......
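The XLNetForQuestionAnswering inputs listed above follow the same span-extraction pattern, and the sketch below shows a hedged training-style call. It assumes pytorch_transformers provides XLNetTokenizer and XLNetForQuestionAnswering together with an 'xlnet-base-cased' checkpoint; adjust the names if your version differs.

```python
# Hedged sketch of the XLNet QA inputs described above; class and checkpoint names
# are assumptions about the surrounding library, not guarantees of this diff.
import torch
from pytorch_transformers import XLNetTokenizer, XLNetForQuestionAnswering

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')

input_ids = torch.tensor([tokenizer.encode("Who was Jim Henson? Jim Henson was a puppeteer")])

# Supervised call: gold span positions are clamped to the sequence length, as documented.
start_positions = torch.tensor([5])
end_positions = torch.tensor([7])

loss = model(input_ids,
             start_positions=start_positions,
             end_positions=end_positions)[0]  # assumed: total span-extraction loss comes first
loss.backward()
```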