Commit 44c985fa authored by thomwolf

update doc for XLM and XLNet

parent 0201d860
@@ -611,11 +611,11 @@ BERT_INPUTS_DOCSTRING = r"""
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@@ -714,7 +714,7 @@ class BertModel(BertPreTrainedModel):
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with two heads on top as done during the pre-training:
+@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class BertForPreTraining(BertPreTrainedModel):
@@ -791,7 +791,7 @@ class BertForPreTraining(BertPreTrainedModel):
return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a `language modeling` head on top. """,
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class BertForMaskedLM(BertPreTrainedModel):
r"""
@@ -856,7 +856,7 @@ class BertForMaskedLM(BertPreTrainedModel):
return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a `next sentence prediction (classification)` head on top. """,
+@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class BertForNextSentencePrediction(BertPreTrainedModel):
r"""
@@ -913,7 +913,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class BertForSequenceClassification(BertPreTrainedModel):
@@ -981,7 +981,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING)
class BertForMultipleChoice(BertPreTrainedModel):
@@ -1016,11 +1016,11 @@ class BertForMultipleChoice(BertPreTrainedModel):
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
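A hedged illustration of the ``(batch_size, num_choices, sequence_length)`` layout described in the hunk above (the token ids and the pad id of 0 are made up; the model call is left commented out because only the shapes matter here):

import torch

# Two candidate endings for one example, already tokenized and padded to the same length.
choice_0 = [101, 2023, 2003, 1037, 2742, 102]
choice_1 = [101, 2023, 2003, 2178, 2742, 102]
input_ids = torch.tensor([[choice_0, choice_1]])   # shape (1, 2, 6): batch, num_choices, seq_len
attention_mask = (input_ids != 0).long()           # same shape as input_ids
labels = torch.tensor([0])                         # index of the correct choice, shape (batch_size,)
# outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # e.g. a BertForMultipleChoice instance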
@@ -1087,7 +1087,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a token classification head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class BertForTokenClassification(BertPreTrainedModel):
@@ -1154,17 +1154,17 @@ class BertForTokenClassification(BertPreTrainedModel):
return outputs # (loss), scores, (hidden_states), (attentions)
-@add_start_docstrings("""Bert Model transformer BERT model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class BertForQuestionAnswering(BertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-Position (index) of the start of the labelled span for computing the token classification loss.
+Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-Position (index) of the end of the labelled span for computing the token classification loss.
+Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
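A minimal sketch of how the span scores produced by such a head are typically read back into a predicted answer span (the scores are dummies and the greedy argmax decoding is an illustrative assumption, not the library's decoder):

import torch

# Dummy scores with the shapes documented above: (batch_size, sequence_length).
start_scores = torch.randn(1, 20)
end_scores = torch.randn(1, 20)

# Greedy read-out of the predicted span; a real decoder would also enforce
# start <= end and a maximum answer length.
start_index = int(torch.argmax(start_scores, dim=1))
end_index = int(torch.argmax(end_scores, dim=1))
# answer_token_ids = input_ids[0, start_index : end_index + 1]   # input_ids assumed in scope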
...
@@ -404,11 +404,11 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
(see `past` output below). Can be used to speed up sequential decoding.
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@@ -541,7 +541,7 @@ class GPT2Model(GPT2PreTrainedModel):
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2LMHeadModel(GPT2PreTrainedModel):
r"""
-**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
@@ -549,7 +549,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
computed for labels in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -571,7 +571,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = GPT2LMHeadModel(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
->>> outputs = model(input_ids, lm_labels=input_ids)
+>>> outputs = model(input_ids, labels=input_ids)
>>> loss, logits = outputs[:2]
"""
@@ -590,17 +590,17 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
self._tie_or_clone_weights(self.lm_head,
self.transformer.wte)
-def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
+def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
outputs = (lm_logits,) + transformer_outputs[1:]
-if lm_labels is not None:
+if labels is not None:
# Shift so that tokens < n predict n
shift_logits = lm_logits[..., :-1, :].contiguous()
-shift_labels = lm_labels[..., 1:].contiguous()
+shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
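The docstring above notes that the labels **are shifted** inside the model; a small, self-contained sketch of what that shift amounts to, mirroring the loss computation in the hunk above (the tensor sizes are dummies):

import torch
from torch.nn import CrossEntropyLoss

lm_logits = torch.randn(1, 6, 50257)       # (batch, seq_len, vocab): dummy prediction scores
labels = torch.randint(0, 50257, (1, 6))   # e.g. labels = input_ids, as the docstring suggests

# Shift so that tokens < n predict n: logits at position i are scored against the label at i+1.
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))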
@@ -639,11 +639,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
(see `past` output below). Can be used to speed up sequential decoding.
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
...
@@ -414,11 +414,11 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@@ -536,7 +536,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
r"""
-**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
@@ -544,7 +544,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
computed for labels in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -562,7 +562,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
>>> model = OpenAIGPTLMHeadModel(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
->>> outputs = model(input_ids, lm_labels=input_ids)
+>>> outputs = model(input_ids, labels=input_ids)
>>> loss, logits = outputs[:2]
"""
@@ -581,16 +581,16 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
self._tie_or_clone_weights(self.lm_head,
self.transformer.tokens_embed)
-def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
+def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
outputs = (lm_logits,) + transformer_outputs[1:]
-if lm_labels is not None:
+if labels is not None:
# Shift so that tokens < n predict n
shift_logits = lm_logits[..., :-1, :].contiguous()
-shift_labels = lm_labels[..., 1:].contiguous()
+shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
@@ -625,11 +625,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
...
@@ -937,13 +937,13 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-**mems**:
+**mems**: (`optional`)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
-Mask indices selected in ``[0, 1]``:
+Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@@ -954,7 +954,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
-**mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
@@ -1270,7 +1270,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
**prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
We don't output them when the loss is computed to speedup adaptive softmax decoding.
-**mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
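A hedged sketch of how the `mems` described above are typically threaded through successive segments (the segment chunks are dummies and `model` is assumed to be an already-instantiated Transfo-XL model):

import torch

# Two consecutive segments of a longer sequence (dummy token ids).
segments = [torch.randint(0, 1000, (1, 128)) for _ in range(2)]

mems = None
for segment_ids in segments:
    # Pass the returned `mems` back in so each segment attends to the cached hidden-states.
    outputs = model(segment_ids, mems=mems)
    last_hidden_state, mems = outputs[0], outputs[1]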
...
@@ -538,7 +538,6 @@ class PoolerAnswerClass(nn.Module):
class SQuADHead(nn.Module):
""" A SQuAD head inspired by XLNet.
-Compute
"""
def __init__(self, config):
super(SQuADHead, self).__init__()
...
@@ -30,7 +30,7 @@ from torch import nn
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss, MSELoss
-from .modeling_utils import (PretrainedConfig, PreTrainedModel,
+from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings,
prune_linear_layer, SequenceSummary, SQuADHead)
logger = logging.getLogger(__name__)
@@ -392,28 +392,94 @@ class XLMPreTrainedModel(PreTrainedModel):
module.weight.data.fill_(1.0)
-class XLMModel(XLMPreTrainedModel):
-"""
-XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-Paper: https://arxiv.org/abs/1901.07291
-Original code: https://github.com/facebookresearch/XLM
-Args:
-`config`: a XLMConfig class instance with the configuration to build a new model
-`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-This can be used to compute head importance metrics. Default: False
-Example::
-config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-model = modeling.XLMModel(config=config)
-"""
+XLM_START_DOCSTRING = r""" The XLM model was proposed in
+`Cross-lingual Language Model Pretraining`_
+by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
+- a causal language modeling (CLM) objective (next token prediction),
+- a masked language modeling (MLM) objective (Bert-like), or
+- a Translation Language Modeling (TLM) object (extension of Bert's MLM to multiple language inputs)
+Original code can be found `here`_.
+This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+refer to the PyTorch documentation for all matter related to general usage and behavior.
+.. _`Cross-lingual Language Model Pretraining`:
+https://arxiv.org/abs/1901.07291
+.. _`torch.nn.Module`:
+https://pytorch.org/docs/stable/nn.html#module
+.. _`here`:
+https://github.com/facebookresearch/XLM
+Parameters:
+config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+"""
+XLM_INPUTS_DOCSTRING = r"""
+Inputs:
+**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+Indices of input sequence tokens in the vocabulary.
+Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
+See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+Indices of positions of each input sequence tokens in the position embeddings.
+Selected in the range ``[0, config.max_position_embeddings - 1[``.
+**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+The embeddings from these tokens will be summed with the respective token embeddings.
+Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+**langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+A parallel sequence of tokens to be used to indicate the language of each token in the input.
+Indices are selected in the pre-trained language vocabulary,
+i.e. in the range ``[0, config.n_langs - 1[``.
+**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
+Mask to avoid performing attention on padding token indices.
+Mask values selected in ``[0, 1]``:
+``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+**lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+Length of each sentence that can be used to avoid performing attention on padding token indices.
+You can also use `attention_mask` for the same result (see above), kept here for compatbility.
+Indices selected in ``[0, ..., input_ids.size(-1)]``:
+**cache**:
+dictionary with ``torch.FloatTensor`` that contains pre-computed
+hidden-states (key and values in the attention blocks) as computed by the model
+(see `cache` output below). Can be used to speed up sequential decoding.
+The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
+**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+Mask to nullify selected heads of the self-attention modules.
+Mask values selected in ``[0, 1]``:
+``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
+XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+class XLMModel(XLMPreTrainedModel):
+r"""
+Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+Sequence of hidden-states at the last layer of the model.
+**attentions**: (`optional`, returned when ``config.output_attentions=True``)
+list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+of shape ``(batch_size, sequence_length, hidden_size)``:
+Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+Examples::
+>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+>>> model = XLMModel(config)
+>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+>>> outputs = model(input_ids)
+>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
+"""
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads',
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
@@ -493,57 +559,8 @@ class XLMModel(XLMPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
-def forward(self, input_ids, lengths=None, positions=None, langs=None,
+def forward(self, input_ids, lengths=None, position_ids=None, langs=None,
token_type_ids=None, attention_mask=None, cache=None, head_mask=None): # src_enc=None, src_len=None,
-"""
-Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-Parameters:
-`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-`lengths`: ``torch.LongTensor`` of size ``bs``, containing the length of each sentence
-`positions`: ``torch.LongTensor`` of size ``(bs, slen)``, containing word positions
-`langs`: ``torch.LongTensor`` of size ``(bs, slen)``, containing language IDs
-`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-a `sentence B` token (see XLM paper for more details).
-`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-input sequence length in the current batch. It's the mask that we typically use for attention when
-a batch has varying length sentences.
-`cache`: TODO
-`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-Returns:
-A ``tuple(encoded_layers, pooled_output)``, with
-``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
-- ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states at the end \
-of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each \
-encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size],
-- ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
-to the last attention block of shape [batch_size, sequence_length, hidden_size],
-``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a
-classifier pre-trained on top of the hidden state associated to the first character of the
-input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
-Example::
-# Already been converted into WordPiece token ids
-input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-# or
-all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
-"""
if lengths is None:
lengths = (input_ids != self.pad_index).sum(dim=1).long()
# mask = input_ids != self.pad_index
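The `lengths` and `attention_mask` inputs described in the XLM inputs docstring carry the same information, and the hunk above shows how `lengths` is recovered from the padded ids. A minimal sketch of converting between the two (a pad index of 0 is an illustrative assumption):

import torch

input_ids = torch.tensor([[31, 51, 99], [15, 5, 0]])   # dummy batch, 0 assumed to be pad_index

# From ids to lengths, as XLMModel.forward does when `lengths` is not given:
lengths = (input_ids != 0).sum(dim=1).long()           # tensor([3, 2])

# From lengths to a mask equivalent to the `attention_mask` described above:
attention_mask = (torch.arange(input_ids.size(1))[None, :] < lengths[:, None]).long()
# tensor([[1, 1, 1],
#         [1, 1, 0]])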
@@ -563,18 +580,15 @@ class XLMModel(XLMPreTrainedModel):
# if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
-# positions
-if positions is None:
-positions = input_ids.new((slen,)).long()
-positions = torch.arange(slen, out=positions).unsqueeze(0)
+# position_ids
+if position_ids is None:
+position_ids = input_ids.new((slen,)).long()
+position_ids = torch.arange(slen, out=position_ids).unsqueeze(0)
else:
-assert positions.size() == (bs, slen) # (slen, bs)
-# positions = positions.transpose(0, 1)
+assert position_ids.size() == (bs, slen) # (slen, bs)
+# position_ids = position_ids.transpose(0, 1)
# langs
-assert langs is None or token_type_ids is None, "You can only use one among langs and token_type_ids"
-if token_type_ids is not None:
-langs = token_type_ids
if langs is not None:
assert langs.size() == (bs, slen) # (slen, bs)
# langs = langs.transpose(0, 1)
@@ -598,7 +612,7 @@ class XLMModel(XLMPreTrainedModel):
if cache is not None:
_slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:]
-positions = positions[:, -_slen:]
+position_ids = position_ids[:, -_slen:]
if langs is not None:
langs = langs[:, -_slen:]
mask = mask[:, -_slen:]
@@ -606,9 +620,11 @@ class XLMModel(XLMPreTrainedModel):
# embeddings
tensor = self.embeddings(input_ids)
-tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
+tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
if langs is not None:
tensor = tensor + self.lang_embeddings(langs)
+if token_type_ids is not None:
+tensor = tensor + self.embeddings(token_type_ids)
tensor = self.layer_norm_emb(tensor)
tensor = F.dropout(tensor, p=self.dropout, training=self.training)
tensor *= mask.unsqueeze(-1).to(tensor.dtype)
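The two added lines in the hunk above fold `token_type_ids` through the same word-embedding table instead of routing them via `langs`. A runnable toy version of the resulting embedding sum, with made-up sizes and plain `nn.Embedding` stand-ins for the model's layers:

import torch
from torch import nn

bs, slen, dim, vocab, n_langs, max_pos = 2, 5, 16, 100, 4, 512
word_embeddings = nn.Embedding(vocab, dim)        # stand-in for self.embeddings
position_embeddings = nn.Embedding(max_pos, dim)  # stand-in for self.position_embeddings
lang_embeddings = nn.Embedding(n_langs, dim)      # stand-in for self.lang_embeddings

input_ids = torch.randint(0, vocab, (bs, slen))
position_ids = torch.arange(slen).unsqueeze(0)
langs = torch.zeros(bs, slen, dtype=torch.long)
token_type_ids = torch.zeros(bs, slen, dtype=torch.long)

tensor = word_embeddings(input_ids)
tensor = tensor + position_embeddings(position_ids).expand_as(tensor)
tensor = tensor + lang_embeddings(langs)
tensor = tensor + word_embeddings(token_type_ids)   # token types reuse the word-embedding table
tensor = nn.LayerNorm(dim)(tensor)                  # corresponds to self.layer_norm_emb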
@@ -702,25 +718,40 @@ class XLMPredLayer(nn.Module):
return outputs
+@add_start_docstrings("""The XLM Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """,
+XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class XLMWithLMHeadModel(XLMPreTrainedModel):
-""" XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-Paper: https://arxiv.org/abs/1901.07291
-Original code: https://github.com/facebookresearch/XLM
-Args:
-`config`: a XLMConfig class instance with the configuration to build a new model
-`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-This can be used to compute head importance metrics. Default: False
-Example::
-config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-model = modeling.XLMModel(config=config)
+r"""
+**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+Labels for language modeling.
+Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+All labels set to ``-1`` are ignored (masked), the loss is only
+computed for labels in ``[0, ..., config.vocab_size]``
+Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+Language modeling loss.
+**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+**attentions**: (`optional`, returned when ``config.output_attentions=True``)
+list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+of shape ``(batch_size, sequence_length, hidden_size)``:
+Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+Examples::
+>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+>>> model = XLMWithLMHeadModel(config)
+>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+>>> outputs = model(input_ids)
+>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(XLMWithLMHeadModel, self).__init__(config)
@@ -735,57 +766,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
"""
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
-def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
+def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
attention_mask=None, cache=None, labels=None, head_mask=None):
-"""
-Args:
-`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-`lengths`: TODO
-`positions`: TODO
-`langs`: TODO
-`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-a `sentence B` token (see XLM paper for more details).
-`attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-input sequence length in the current batch. It's the mask that we typically use for attention when
-a batch has varying length sentences.
-`cache`: TODO
-`labels`: TODO
-`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-Returns:
-A ``tuple(encoded_layers, pooled_output)``, with
-``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
-If ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states \
-at the end of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each \
-encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size],
-If ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
-to the last attention block of shape [batch_size, sequence_length, hidden_size],
-``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a \
-classifier pre-trained on top of the hidden state associated to the first character of the \
-input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
-Example::
-# Already been converted into WordPiece token ids
-input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-# or
-all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
-"""
-transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
+transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
output = transformer_outputs[0]
@@ -795,25 +778,40 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
return outputs
+@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of
+the pooled output) e.g. for GLUE tasks. """,
+XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class XLMForSequenceClassification(XLMPreTrainedModel):
-"""XLM model ("XLM: Generalized Autoregressive Pretraining for Language Understanding").
-Args:
-`config`: a XLMConfig class instance with the configuration to build a new model
-`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-This can be used to compute head importance metrics. Default: False
-`summary_type`: str, "last", "first", "mean", or "attn". The method
-to pool the input to get a vector representation. Default: last
-Example::
-config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
-n_layer=12, num_attention_heads=12, intermediate_size=3072)
-model = modeling.XLMModel(config=config)
+r"""
+**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+Labels for computing the sequence classification/regression loss.
+Indices should be in ``[0, ..., config.num_labels]``.
+If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+Classification (or regression if config.num_labels==1) loss.
+**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+Classification (or regression if config.num_labels==1) scores (before SoftMax).
+**attentions**: (`optional`, returned when ``config.output_attentions=True``)
+list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+of shape ``(batch_size, sequence_length, hidden_size)``:
+Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+Examples::
+>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+>>>
+>>> model = XLMForSequenceClassification(config)
+>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
+>>> outputs = model(input_ids, labels=labels)
+>>> loss, logits = outputs[:2]
"""
def __init__(self, config):
@@ -825,42 +823,9 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.apply(self.init_weights)
-def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
+def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
attention_mask=None, cache=None, labels=None, head_mask=None):
-"""
-Args:
-input_ids: TODO
-lengths: TODO
-positions: TODO
-langs: TODO
-token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-but with 1 for real tokens and 0 for padding.
-Added for easy compatibility with the XLM model (which uses this negative masking).
-You can only uses one among `input_mask` and `attention_mask`
-cache: TODO
-labels: TODO
-head_mask: TODO
-Returns:
-A ``tuple(logits_or_loss, new_mems)``. If ``labels`` is ``None``, return token logits with shape
-[batch_size, sequence_length]. If it isn't ``None``, return the ``CrossEntropy`` loss with the targets.
-``new_mems`` is a list (num layers) of updated mem states at the entry of each layer \
-each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model] \
-Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``
-Example::
-# Already been converted into WordPiece token ids
-input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-"""
-transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
+transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
output = transformer_outputs[0]
@@ -881,26 +846,53 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
return outputs
@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class XLMForQuestionAnswering(XLMPreTrainedModel): class XLMForQuestionAnswering(XLMPreTrainedModel):
""" r"""
XLM model for Question Answering (span extraction). **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
This module is composed of the XLM model with a linear layer on top of Labels for position (index) of the start of the labelled span for computing the token classification loss.
the sequence output that computes start_logits and end_logits Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels whether a question has an answer or no answer (SQuAD 2.0)
**cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
**p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total span extraction loss: the sum of the cross-entropy losses for the start and end positions.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Examples::
>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>>
>>> model = XLMForQuestionAnswering(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss, start_scores, end_scores = outputs[:3]
Args:
`config`: a XLMConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Example::
config = XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = XLMForQuestionAnswering(config)
""" """
def __init__(self, config): def __init__(self, config):
super(XLMForQuestionAnswering, self).__init__(config) super(XLMForQuestionAnswering, self).__init__(config)
...@@ -910,63 +902,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -910,63 +902,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
self.apply(self.init_weights) self.apply(self.init_weights)
def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None, def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
attention_mask=None, cache=None, start_positions=None, end_positions=None, attention_mask=None, cache=None, start_positions=None, end_positions=None,
cls_index=None, is_impossible=None, p_mask=None, head_mask=None): cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
"""
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
Args:
input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
lengths: TODO
positions: TODO
langs: TODO
token_type_ids: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLM paper for more details).
attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the XLM model (which uses this negative masking).
You can only uses one among `input_mask` and `attention_mask`
cache: TODO
start_positions: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
end_positions: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
cls_index: TODO
is_impossible: TODO
p_mask: TODO
head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
Either the ``total_loss`` or a ``tuple(start_logits, end_logits)``
if ``start_positions`` and ``end_positions`` are not ``None``, \
outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if ``start_positions`` or ``end_positions`` is ``None``:
Outputs a ``tuple(start_logits, end_logits)`` which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
# or
start_logits, end_logits = model.forward(input_ids, token_type_ids, input_mask)
"""
transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask) langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
output = transformer_outputs[0] output = transformer_outputs[0]
......
...@@ -15,8 +15,6 @@ ...@@ -15,8 +15,6 @@
# limitations under the License. # limitations under the License.
""" PyTorch XLNet model. """ PyTorch XLNet model.
""" """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import json
...@@ -32,7 +30,8 @@ from torch.nn import functional as F ...@@ -32,7 +30,8 @@ from torch.nn import functional as F
from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits) SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits,
add_start_docstrings)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -619,26 +618,105 @@ class XLNetPreTrainedModel(PreTrainedModel): ...@@ -619,26 +618,105 @@ class XLNetPreTrainedModel(PreTrainedModel):
module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range) module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
class XLNetModel(XLNetPreTrainedModel): XLNET_START_DOCSTRING = r""" The XLNet model was proposed in
"""XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding"). `XLNet: Generalized Autoregressive Pretraining for Language Understanding`_
by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
XLNet is an extension of the Transformer-XL model pre-trained using an autoregressive method
to learn bidirectional contexts by maximizing the expected likelihood over all permutations
of the input sequence factorization order.
TODO Lysandre filled: this was copied from the XLNetLMHeadModel, check that it's ok. The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
Args: Due to the difficulty of training a fully auto-regressive model over various factorization orders,
`config`: a XLNetConfig class instance with the configuration to build a new model XLNet is pretrained using only a subset of the output tokens as targets, which are selected
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False with the `target_mapping` input.
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False To use XLNet for sequential decoding (i.e. not in a fully bi-directional setting), use the `perm_mask` and
`target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matter related to general usage and behavior.
Example:: .. _`XLNet: Generalized Autoregressive Pretraining for Language Understanding`:
http://arxiv.org/abs/1906.08237
config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768, .. _`torch.nn.Module`:
n_layer=12, num_attention_heads=12, intermediate_size=3072) https://pytorch.org/docs/stable/nn.html#module
model = modeling.XLNetModel(config=config) Parameters:
config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
"""
XLNET_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**input_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
Kept for compatibility with the original code base.
You can only use one of `input_mask` and `attention_mask`
Mask values selected in ``[0, 1]``:
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
**mems**: (`optional`)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
**perm_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
If ``perm_mask[k, i, j] = 0``, i attends to j in batch k;
if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
If None, each token attends to all the others (full bidirectional attention).
Only used during pretraining (to define factorization order) or for sequential decoding (generation).
**target_mapping**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_predict, sequence_length)``:
Mask to indicate the output tokens to use.
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
Only used during pretraining for partial prediction or for sequential decoding (generation).
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class XLNetModel(XLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Examples::
>>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
>>> model = XLNetModel(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
TODO Lysandre filled: Added example usage
""" """
def __init__(self, config): def __init__(self, config):
super(XLNetModel, self).__init__(config) super(XLNetModel, self).__init__(config)
...@@ -765,50 +843,6 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -765,50 +843,6 @@ class XLNetModel(XLNetPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None, head_mask=None): mems=None, perm_mask=None, target_mapping=None, head_mask=None):
"""
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
Args:
input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
0 for real tokens and 1 for padding.
attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the BERT model (which uses this negative masking).
You can only uses one among `input_mask` and `attention_mask`
mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
from previous batches. The length of the list equals n_layer.
If None, no memory is used.
perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
If perm_mask[k, i, j] = 0, i attend to j in batch k;
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
If None, each position attends to all the others.
target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
on the j-th token.
Only used during pretraining for partial prediction.
Set to None during finetuning.
head_mask: TODO Lysandre didn't fill
Returns:
TODO Lysandre didn't fill: Missing returns!
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
# or
all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
TODO Lysandre filled: Filled with the LMHead example, is probably different since it has a different output
"""
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension # but we want a unified interface in the library with the batch size on the first dimension
# so we move here the first dimension (batch) to the end # so we move here the first dimension (batch) to the end
...@@ -952,23 +986,49 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -952,23 +986,49 @@ class XLNetModel(XLNetPreTrainedModel):
return outputs # outputs, new_mems, (hidden_states), (attentions) return outputs # outputs, new_mems, (hidden_states), (attentions)
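# The comment above notes that the original XLNet code works on [len, bsz] tensors while this
# library exposes a batch-first [bsz, len] interface; the conversion is a transpose. A tiny
# illustration with arbitrary token ids:
import torch

input_ids = torch.tensor([[31, 51, 99], [15, 5, 0]])    # [batch_size, seq_len], library convention
time_first = input_ids.transpose(0, 1).contiguous()     # [seq_len, batch_size], original XLNet convention
assert time_first.shape == (3, 2)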
@add_start_docstrings("""XLNet Model with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class XLNetLMHeadModel(XLNetPreTrainedModel): class XLNetLMHeadModel(XLNetPreTrainedModel):
"""XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding"). r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Args: Labels for language modeling.
`config`: a XLNetConfig class instance with the configuration to build a new model Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False Indices are selected in ``[-1, 0, ..., config.vocab_size]``
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. All labels set to ``-1`` are ignored (masked), the loss is only
This can be used to compute head importance metrics. Default: False computed for labels in ``[0, ..., config.vocab_size]``
Example:: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768, Language modeling loss.
n_layer=12, num_attention_heads=12, intermediate_size=3072) **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Examples::
>>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
>>> model = XLNetLMHeadModel(config)
>>> # We show how to setup inputs to predict a next token using a bi-directional context.
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0) # We will predict the masked token
>>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
>>> perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
>>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
>>> next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
model = modeling.XLNetLMHeadModel(config=config)
TODO Lysandre modified: Changed XLNetModel to XLNetLMHeadModel in the example
""" """
def __init__(self, config): def __init__(self, config):
super(XLNetLMHeadModel, self).__init__(config) super(XLNetLMHeadModel, self).__init__(config)
...@@ -989,58 +1049,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -989,58 +1049,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None, mems=None, perm_mask=None, target_mapping=None,
labels=None, head_mask=None): labels=None, head_mask=None):
"""
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
Args:
input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
0 for real tokens and 1 for padding.
attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the BERT model (which uses this negative masking).
You can only uses one among `input_mask` and `attention_mask`
mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
from previous batches. The length of the list equals n_layer.
If None, no memory is used.
perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
If perm_mask[k, i, j] = 0, i attend to j in batch k;
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
If None, each position attends to all the others.
target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
on the j-th token.
Only used during pretraining for partial prediction.
Set to None during finetuning.
Returns:
A ``tuple(encoded_layers, pooled_output)``, with
``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
- ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states \
at the end of each attention block (i.e. 12 full sequences for XLNet-base, 24 for XLNet-large), \
each encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, d_model],
- ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
to the last attention block of shape [batch_size, sequence_length, d_model],
``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, d_model] which is the output of a \
classifier pretrained on top of the hidden state associated to the first character of the \
input (`CLS`) to train on the Next-Sentence task (see XLNet's paper).
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
# or
all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
"""
transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask, transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
mems, perm_mask, target_mapping, head_mask) mems, perm_mask, target_mapping, head_mask)
...@@ -1055,30 +1063,48 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1055,30 +1063,48 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
labels.view(-1)) labels.view(-1))
outputs = (loss,) + outputs outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions) return outputs # return (loss), logits, mems, (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class XLNetForSequenceClassification(XLNetPreTrainedModel): class XLNetForSequenceClassification(XLNetPreTrainedModel):
"""XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding"). r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Examples::
>>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
>>>
>>> model = XLNetForSequenceClassification(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids, labels=labels)
>>> loss, logits = outputs[:2]
Args:
`config`: a XLNetConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
`summary_type`: str, "last", "first", "mean", or "attn". The method
to pool the input to get a vector representation. Default: last
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
""" """
def __init__(self, config): def __init__(self, config):
super(XLNetForSequenceClassification, self).__init__(config) super(XLNetForSequenceClassification, self).__init__(config)
...@@ -1093,57 +1119,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1093,57 +1119,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None, mems=None, perm_mask=None, target_mapping=None,
labels=None, head_mask=None): labels=None, head_mask=None):
"""
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
Args:
input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
input_mask: float32 Tensor in shape [bsz, len], the input mask.
0 for real tokens and 1 for padding.
attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the BERT model (which uses this negative masking).
You can only uses one among `input_mask` and `attention_mask`
mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
from previous batches. The length of the list equals n_layer.
If None, no memory is used.
perm_mask: float32 Tensor in shape [bsz, len, len].
If perm_mask[k, i, j] = 0, i attend to j in batch k;
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
If None, each position attends to all the others.
target_mapping: float32 Tensor in shape [bsz, num_predict, len].
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
on the j-th token.
Only used during pre-training for partial prediction.
Set to None during fine-tuning.
labels: TODO Lysandre didn't fill
head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
A ``tuple(logits_or_loss, mems)``
``logits_or_loss``: if ``labels`` is ``None``, ``logits_or_loss`` corresponds to token logits with shape \
[batch_size, sequence_length]. If it is not ``None``, it corresponds to the ``CrossEntropy`` loss \
with the targets.
``new_mems``: list (num layers) of updated mem states at the entry of each layer \
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model] \
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
# or
all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
"""
transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask, transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
mems, perm_mask, target_mapping, head_mask) mems, perm_mask, target_mapping, head_mask)
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -1163,28 +1138,60 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1163,28 +1138,60 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions) return outputs # return (loss), logits, mems, (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class XLNetForQuestionAnswering(XLNetPreTrainedModel): class XLNetForQuestionAnswering(XLNetPreTrainedModel):
""" r"""
XLNet model for Question Answering (span extraction). **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels whether a question has an answer or no answer (SQuAD 2.0)
**cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
**p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total span extraction loss: the sum of the cross-entropy losses for the start and end positions.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Examples::
>>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
>>>
>>> model = XLNetForQuestionAnswering(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss, start_scores, end_scores = outputs[:3]
This module is composed of the XLNet model with a linear layer on top of
the sequence output that computes ``start_logits`` and ``end_logits``
Args:
`config`: a XLNetConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Example::
config = XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = XLNetForQuestionAnswering(config)
""" """
def __init__(self, config): def __init__(self, config):
super(XLNetForQuestionAnswering, self).__init__(config) super(XLNetForQuestionAnswering, self).__init__(config)
...@@ -1202,53 +1209,6 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1202,53 +1209,6 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
mems=None, perm_mask=None, target_mapping=None, mems=None, perm_mask=None, target_mapping=None,
start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
head_mask=None): head_mask=None):
"""
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see XLNet paper for more details).
`attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
but with 1 for real tokens and 0 for padding.
Added for easy compatibility with the BERT model (which uses this negative masking).
You can only uses one among ``input_mask`` and ``attention_mask``
`input_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Returns:
if ``start_positions`` and ``end_positions`` are not ``None``, outputs the total_loss which is the sum of the \
``CrossEntropy`` loss for the start and end token positions.
if ``start_positions`` or ``end_positions`` is ``None``, outputs a tuple of ``start_logits``, ``end_logits``
which are the logits respectively for the start and end position tokens of shape \
[batch_size, sequence_length].
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
# or
start_logits, end_logits = model.forward(input_ids, token_type_ids, input_mask)
"""
transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask, transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
mems, perm_mask, target_mapping, head_mask) mems, perm_mask, target_mapping, head_mask)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
......