"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "e02f95b2298997016cd01fdb182442093b34e8d2"
Commit 7511f3dd authored by Lysandre, committed by Lysandre Debut

PyTorch CTRL + Style

parent 980211a6
CTRL
----------------------------------------------------
CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation`_
by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matter related to general usage and behavior.
Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl),
you'll be able to convert from TF to our HuggingFace/Transformers format using the
``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 <https://github.com/huggingface/transformers/issues/1654>`_).
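As a quick orientation, here is a minimal usage sketch (not part of this commit, assembled from the CTRLLMHeadModel example that appears further down in this diff; it assumes the pretrained ``ctrl`` checkpoint is available)::

    import torch
    from transformers import CTRLTokenizer, CTRLLMHeadModel

    tokenizer = CTRLTokenizer.from_pretrained('ctrl')
    model = CTRLLMHeadModel.from_pretrained('ctrl')

    # The first token acts as a control code ("Links" here), as described above.
    input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids, labels=input_ids)
    loss, logits = outputs[:2]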
...
@@ -38,5 +38,6 @@ class XLMRobertaConfig(RobertaConfig):
This class overrides :class:`~transformers.RobertaConfig`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "xlm-roberta"
@@ -607,7 +607,6 @@ class AlbertMLMHead(nn.Module):
"Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
)
class AlbertForMaskedLM(AlbertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@@ -698,7 +697,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
ALBERT_START_DOCSTRING,
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@@ -794,7 +792,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
ALBERT_START_DOCSTRING,
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
...
@@ -813,7 +813,6 @@ class BertModel(BertPreTrainedModel):
BERT_START_DOCSTRING,
)
class BertForPreTraining(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@@ -907,11 +906,8 @@ class BertForPreTraining(BertPreTrainedModel):
return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
-@add_start_docstrings(
-"""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING
-)
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class BertForMaskedLM(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@@ -1018,11 +1014,9 @@ class BertForMaskedLM(BertPreTrainedModel):
@add_start_docstrings(
-"""Bert Model with a `next sentence prediction (classification)` head on top. """,
-BERT_START_DOCSTRING,
+"""Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
)
class BertForNextSentencePrediction(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@@ -1105,7 +1099,6 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
BERT_START_DOCSTRING,
)
class BertForSequenceClassification(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@@ -1198,7 +1191,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
BERT_START_DOCSTRING,
)
class BertForMultipleChoice(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@@ -1294,7 +1286,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
BERT_START_DOCSTRING,
)
class BertForTokenClassification(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@@ -1386,7 +1377,6 @@ class BertForTokenClassification(BertPreTrainedModel):
BERT_START_DOCSTRING,
)
class BertForQuestionAnswering(BertPreTrainedModel):
def __init__(self, config):
super(BertForQuestionAnswering, self).__init__(config)
self.num_labels = config.num_labels
...
@@ -58,19 +58,20 @@ class CamembertModel(RobertaModel):
This class overrides :class:`~transformers.RobertaModel`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings(
-"""CamemBERT Model with a `language modeling` head on top. """,
-CAMEMBERT_START_DOCSTRING,
+"""CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
)
class CamembertForMaskedLM(RobertaForMaskedLM):
"""
This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -85,6 +86,7 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification):
This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -99,9 +101,11 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice):
This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings(
"""CamemBERT Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
@@ -112,6 +116,6 @@ class CamembertForTokenClassification(RobertaForTokenClassification):
This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -24,7 +24,7 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss
from .configuration_ctrl import CTRLConfig
-from .file_utils import add_start_docstrings
+from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import Conv1D, PreTrainedModel
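For context, ``add_start_docstrings_to_callable`` is imported here so that the shared input documentation can be attached to each ``forward`` method rather than to the class decorator. A rough sketch of how such a docstring-prepending decorator factory generally works (an illustration only, not the actual ``file_utils`` implementation)::

    def add_start_docstrings(*docstr):
        # Illustrative sketch: prepend shared documentation to the decorated callable's docstring.
        def decorator(fn):
            fn.__doc__ = "".join(docstr) + (fn.__doc__ or "")
            return fn
        return decorator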
@@ -184,57 +184,53 @@ class CTRLPreTrainedModel(PreTrainedModel):
module.weight.data.fill_(1.0)
-CTRL_START_DOCSTRING = r""" CTRL model was proposed in
-`CTRL: A Conditional Transformer Language Model for Controllable Generation`_
-by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
-corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
+CTRL_START_DOCSTRING = r"""
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matter related to general usage and behavior.
-.. _`CTRL: A Conditional Transformer Language Model for Controllable Generation`:
-https://www.github.com/salesforce/ctrl
-.. _`torch.nn.Module`:
-https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
-CTRL_INPUTS_DOCSTRING = r""" Inputs:
-**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-Indices of input sequence tokens in the vocabulary.
-CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
-the right rather than the left.
+CTRL_INPUTS_DOCSTRING = r"""
+Args:
+input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`transformers.CTRLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
-:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-**past**:
-list of ``torch.FloatTensor`` (one for each layer):
-that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+`What are input IDs? <../glossary.html#input-ids>`__
+past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
+Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
-**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-The embeddings from these tokens will be summed with the respective token embeddings.
-Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
-**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+`What are attention masks? <../glossary.html#attention-mask>`__
+token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+Segment token indices to indicate first and second portions of the inputs.
+Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+corresponds to a `sentence B` token
+`What are token type IDs? <../glossary.html#token-type-ids>`_
+position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
-**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+`What are position IDs? <../glossary.html#position-ids>`_
+head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
-``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
-Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+:obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
+input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
"""
@@ -243,35 +239,8 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
@add_start_docstrings(
"The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING,
-CTRL_INPUTS_DOCSTRING,
)
class CTRLModel(CTRLPreTrainedModel):
-r"""
-Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-Sequence of hidden-states at the last layer of the model.
-**past**:
-list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
-that contains pre-computed hidden-states (key and values in the attention blocks).
-Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-should not be passed as input ids as they have already been computed.
-**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-of shape ``(batch_size, sequence_length, hidden_size)``:
-Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-**attentions**: (`optional`, returned when ``config.output_attentions=True``)
-list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-Examples::
-tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-model = CTRLModel.from_pretrained('ctrl')
-input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids)
-last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
-"""
def __init__(self, config):
super().__init__(config)
@@ -310,6 +279,7 @@ class CTRLModel(CTRLPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
+@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
def forward(
self,
input_ids=None,
@@ -320,6 +290,36 @@ class CTRLModel(CTRLPreTrainedModel):
head_mask=None,
inputs_embeds=None,
):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`CTRLConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
@@ -435,50 +435,10 @@ class CTRLModel(CTRLPreTrainedModel):
@add_start_docstrings(
"""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
CTRL_START_DOCSTRING,
-CTRL_INPUTS_DOCSTRING,
)
class CTRLLMHeadModel(CTRLPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**past**:
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
that contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
def __init__(self, config):
super().__init__(config)
@@ -499,6 +459,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
inputs.update(kwargs)
return inputs
+@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
def forward(
self,
input_ids=None,
@@ -510,6 +471,49 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
inputs_embeds=None,
labels=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`~transformers.CTRLConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
Language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
past=past,
...
@@ -390,7 +390,6 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
DISTILBERT_START_DOCSTRING,
)
class DistilBertModel(DistilBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@@ -484,11 +483,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
@add_start_docstrings(
-"""DistilBert Model with a `masked language modeling` head on top. """,
-DISTILBERT_START_DOCSTRING,
+"""DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
)
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.output_attentions = config.output_attentions
@@ -567,7 +564,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING,
)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@@ -645,7 +641,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING,
)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@@ -745,7 +740,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING,
)
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
...
@@ -269,12 +269,6 @@ GPT2_START_DOCSTRING = r"""
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matter related to general usage and behavior.
-.. _`Language Models are Unsupervised Multitask Learners`:
-https://openai.com/blog/better-language-models/
-.. _`torch.nn.Module`:
-https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
@@ -328,7 +322,6 @@ GPT2_INPUTS_DOCSTRING = r"""
GPT2_START_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.output_hidden_states = config.output_hidden_states
@@ -514,7 +507,6 @@ class GPT2Model(GPT2PreTrainedModel):
GPT2_START_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = GPT2Model(config)
@@ -624,7 +616,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
GPT2_START_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
config.num_labels = 1
...
@@ -338,7 +338,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.output_attentions = config.output_attentions
@@ -493,7 +492,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = OpenAIGPTModel(config)
@@ -587,7 +585,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
...
@@ -150,6 +150,7 @@ class RobertaModel(BertModel):
This class overrides :class:`~transformers.BertModel`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = RobertaConfig
pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "roberta"
@@ -167,9 +168,7 @@ class RobertaModel(BertModel):
self.embeddings.word_embeddings = value
-@add_start_docstrings(
-"""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING
-)
+@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(BertPreTrainedModel):
config_class = RobertaConfig
pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -652,7 +651,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
-inputs_embeds=inputs_embeds
+inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
...
@@ -560,7 +560,6 @@ ALBERT_INPUTS_DOCSTRING = r"""
ALBERT_START_DOCSTRING,
)
class TFAlbertModel(TFAlbertPreTrainedModel):
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
self.num_hidden_layers = config.num_hidden_layers
@@ -705,11 +704,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
return outputs
-@add_start_docstrings(
-"""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING
-)
+@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
@@ -766,7 +762,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
ALBERT_START_DOCSTRING,
)
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
...
@@ -660,7 +660,6 @@ BERT_INPUTS_DOCSTRING = r"""
BERT_START_DOCSTRING,
)
class TFBertModel(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name="bert")
@@ -711,7 +710,6 @@ class TFBertModel(TFBertPreTrainedModel):
BERT_START_DOCSTRING,
)
class TFBertForPreTraining(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
@@ -767,11 +765,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
-@add_start_docstrings(
-"""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING
-)
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class TFBertForMaskedLM(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
@@ -822,11 +817,9 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
@add_start_docstrings(
-"""Bert Model with a `next sentence prediction (classification)` head on top. """,
-BERT_START_DOCSTRING,
+"""Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
)
class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
@@ -879,7 +872,6 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
BERT_START_DOCSTRING,
)
class TFBertForSequenceClassification(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -938,7 +930,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
BERT_START_DOCSTRING,
)
class TFBertForMultipleChoice(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
@@ -1049,7 +1040,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
BERT_START_DOCSTRING,
)
class TFBertForTokenClassification(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -1108,7 +1098,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
BERT_START_DOCSTRING,
)
class TFBertForQuestionAnswering(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
...
@@ -536,7 +536,6 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@@ -594,11 +593,9 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
@add_start_docstrings(
-"""DistilBert Model with a `masked language modeling` head on top. """,
-DISTILBERT_START_DOCSTRING,
+"""DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.output_attentions = config.output_attentions
@@ -665,7 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -730,7 +726,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -786,7 +781,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
...
@@ -444,7 +444,6 @@ GPT2_INPUTS_DOCSTRING = r"""
GPT2_START_DOCSTRING,
)
class TFGPT2Model(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
@@ -494,7 +493,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
GPT2_START_DOCSTRING,
)
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
@@ -557,7 +555,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
GPT2_START_DOCSTRING,
)
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
config.num_labels = 1
...
@@ -427,7 +427,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@@ -473,7 +472,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@@ -531,7 +529,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
config.num_labels = 1
...
@@ -180,7 +180,6 @@ ROBERTA_INPUTS_DOCSTRING = r"""
ROBERTA_START_DOCSTRING,
)
class TFRobertaModel(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta")
@@ -256,11 +255,8 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
return x
-@add_start_docstrings(
-"""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING
-)
+@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
@@ -340,7 +336,6 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
ROBERTA_START_DOCSTRING,
)
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -394,7 +389,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
ROBERTA_START_DOCSTRING,
)
class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
...
@@ -687,7 +687,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@@ -737,7 +736,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
...
@@ -571,7 +571,6 @@ XLM_INPUTS_DOCSTRING = r"""
XLM_START_DOCSTRING,
)
class TFXLMModel(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -650,7 +649,6 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
XLM_START_DOCSTRING,
)
class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -705,7 +703,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
XLM_START_DOCSTRING,
)
class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -760,7 +757,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
XLM_START_DOCSTRING,
)
class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
...
@@ -780,7 +780,6 @@ XLNET_INPUTS_DOCSTRING = r"""
XLNET_START_DOCSTRING,
)
class TFXLNetModel(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -830,7 +829,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
XLNET_START_DOCSTRING,
)
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -896,7 +894,6 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
XLNET_START_DOCSTRING,
)
class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -961,7 +958,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
XLNET_START_DOCSTRING,
)
class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -1015,11 +1011,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
return outputs # return logits, (mems), (hidden states), (attentions)
-@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-the hidden-states output to compute `span start logits` and `span end logits`). """,
-XLNET_START_DOCSTRING)
+@add_start_docstrings(
+"""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+the hidden-states output to compute `span start logits` and `span end logits`). """,
+XLNET_START_DOCSTRING,
+)
class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
...
@@ -550,7 +550,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLModel(TransfoXLPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.output_attentions = config.output_attentions
@@ -803,7 +802,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = TransfoXLModel(config)
...