Unverified Commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Styling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
Parent commit: 04a17f85
......@@ -13,8 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Utilities for PyTorch Transformer XL model.
Directly adapted from https://github.com/kimiyoung/transformer-xl.
"""
Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl.
"""
......@@ -87,15 +87,13 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
"""
Params:
hidden :: [len*bsz x d_proj]
labels :: [len*bsz]
labels :: [len*bsz]
Return:
if labels is None:
out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
else:
out :: [(len-1)*bsz] Negative log likelihood
We could replace this implementation by the native PyTorch one
if theirs had an option to set bias on all clusters in the native one, here:
https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary else: out ::
[(len-1)*bsz] Negative log likelihood We could replace this implementation by the native PyTorch one if
theirs had an option to set bias on all clusters in the native one, here:
https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
"""
if labels is not None:
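The native PyTorch module mentioned above exists as ``torch.nn.AdaptiveLogSoftmaxWithLoss``; it only exposes a bias on the head cluster (``head_bias``), which is why the library keeps its own projected adaptive softmax. A minimal sketch of the native module, with made-up sizes, for comparison::

    import torch
    from torch import nn

    # Native adaptive softmax: cutoffs split the vocabulary into clusters,
    # but only the head cluster can carry a bias (head_bias).
    adaptive = nn.AdaptiveLogSoftmaxWithLoss(
        in_features=512, n_classes=10000, cutoffs=[1000, 5000], head_bias=True
    )

    hidden = torch.randn(8, 512)             # [len*bsz x d_proj]
    labels = torch.randint(0, 10000, (8,))   # [len*bsz]

    out = adaptive(hidden, labels)           # out.output: per-example target log-probs, out.loss: mean NLL
    log_probs = adaptive.log_prob(hidden)    # [len*bsz x n_tokens] log probabilities over the vocabulary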
......@@ -191,15 +189,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
return out
def log_prob(self, hidden):
r"""Computes log probabilities for all :math:`n\_classes`
From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
r"""
Computes log probabilities for all :math:`n\_classes`. From:
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
Args:
hidden (Tensor): a minibatch of examples
hidden (Tensor): a minibatch of examples
Returns:
log-probabilities for each class :math:`c`
in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a
parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.
Shape:
log-probabilities for each class :math:`c` in range :math:`0 <= c <= n\_classes`, where
:math:`n\_classes` is a parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. Shape:
- Input: :math:`(N, in\_features)`
- Output: :math:`(N, n\_classes)`
"""
......
......@@ -287,8 +287,8 @@ class ModuleUtilsMixin:
Whether or not the attention scores are computed by chunks.
Returns:
:obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]`
or list with :obj:`[None]` for each layer.
:obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or
list with :obj:`[None]` for each layer.
"""
if head_mask is not None:
head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
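A hedged sketch of the broadcast that ``_convert_head_mask_to_5d`` performs, assuming a 12-layer, 12-head model: a 1D ``[num_heads]`` or 2D ``[num_layers, num_heads]`` mask is expanded so it can be multiplied against per-layer attention probabilities of shape ``[batch, num_heads, seq_length, seq_length]``::

    import torch

    num_hidden_layers, num_heads = 12, 12
    head_mask = torch.ones(num_heads)                      # one mask shared by all layers

    if head_mask.dim() == 1:
        head_mask = head_mask[None, None, :, None, None]   # -> [1, 1, num_heads, 1, 1]
        head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1)
    elif head_mask.dim() == 2:
        head_mask = head_mask[:, None, :, None, None]      # -> [num_layers, 1, num_heads, 1, 1]

    # torch.Size([12, 1, 12, 1, 1]): broadcastable against each layer's attention scores
    print(head_mask.shape)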
......@@ -358,9 +358,9 @@ class ModuleUtilsMixin:
"""
Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper <https://arxiv.org/pdf/2001.08361.pdf>`__ section
2.1. Should be overridden for transformers with parameter re-use, e.g. Albert or Universal Transformers, or
if doing long-range modeling with very high sequence lengths.
tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper
<https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overridden for transformers with parameter
re-use, e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.
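A back-of-the-envelope sketch of that default approximation (the helper name below is made up for illustration)::

    # Forward + backward cost of roughly 6 * num_parameters * num_tokens FLOPs,
    # neglecting the quadratic attention term (valid when 12 * d_model << sequence_length).
    def approx_floating_point_ops(num_parameters: int, batch_size: int, sequence_length: int) -> int:
        return 6 * num_parameters * batch_size * sequence_length

    # e.g. a ~110M-parameter model on a batch of 8 sequences of 512 tokens
    print(approx_floating_point_ops(110_000_000, batch_size=8, sequence_length=512))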
Args:
batch_size (:obj:`int`):
......@@ -390,23 +390,24 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
* prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
:class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a
PyTorch model, taking as arguments:
- **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a PyTorch
model, taking as arguments:
- **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the
TensorFlow checkpoint.
- **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated
to the model.
- **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated to
the model.
- **path** (:obj:`str`) -- A path to the TensorFlow checkpoint.
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
derived classes of the same architecture adding modules on top of the base model.
- **authorized_missing_keys** (:obj:`Optional[List[str]]`) -- A list of re patterns of tensor names to ignore
when loading the model (and avoid unnecessary warnings).
- **keys_to_never_save** (:obj:`Optional[List[str]]`) -- A list of tensor names to ignore
when saving the model (useful for keys that aren't trained, but which are deterministic).
- **keys_to_never_save** (:obj:`Optional[List[str]]`) -- A list of tensor names to ignore when saving the
model (useful for keys that aren't trained, but which are deterministic).
"""
config_class = None
......@@ -684,9 +685,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
Arguments:
heads_to_prune (:obj:`Dict[int, List[int]]`):
Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list
of heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will
prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of
heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
0 and 2 on layer 1 and heads 2 and 3 on layer 2.
"""
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
for layer, heads in heads_to_prune.items():
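A short usage sketch of the dictionary format above (the checkpoint name is just an example)::

    from transformers import BertModel

    model = BertModel.from_pretrained("bert-base-uncased")
    # Prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
    model.prune_heads({1: [0, 2], 2: [2, 3]})

    # The pruned heads are recorded in the config, so they stay pruned after save/reload.
    print(model.config.pruned_heads)  # e.g. {1: [0, 2], 2: [2, 3]}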
......@@ -743,8 +744,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
r"""
Instantiate a pretrained pytorch model from a pre-trained model configuration.
The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated).
To train the model, you should first set it back in training mode with ``model.train()``.
The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To
train the model, you should first set it back in training mode with ``model.train()``.
The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
......@@ -806,21 +807,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
proxies (:obj:`Dict[str, str], `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.,
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
request.
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to also return a dictionary containing missing keys, unexpected keys and error
messages.
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only look at local files (e.g., not try downloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
mirror(:obj:`str`, `optional`, defaults to :obj:`None`):
Mirror source to accelerate downloads in China. If you are from China and have an accessibility problem,
you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. Please
refer to the mirror site for more information.
Mirror source to accelerate downloads in China. If you are from China and have an accessibility
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
Please refer to the mirror site for more information.
kwargs (remaining dictionary of keyword arguments, `optional`):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
:obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
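As a hedged illustration of a few of the loading options documented above (the checkpoint name is just an example; ``proxies`` would take a dict such as ``{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}``)::

    from transformers import BertModel

    # Ask for the loading report in addition to the model itself.
    model, loading_info = BertModel.from_pretrained(
        "bert-base-uncased",
        output_loading_info=True,
    )
    print(loading_info["missing_keys"], loading_info["unexpected_keys"])

    # Once the files are cached, later calls can skip the network entirely.
    model = BertModel.from_pretrained("bert-base-uncased", local_files_only=True)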
......@@ -1142,8 +1141,8 @@ class PoolerStartLogits(nn.Module):
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS).
1.0 means the token should be masked.
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means the
token should be masked.
Returns:
:obj:`torch.FloatTensor`: The start logits for SQuAD.
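A sketch of how a ``p_mask`` like the one described above is typically applied, so that masked positions receive essentially zero probability after the softmax::

    import torch

    batch_size, seq_len = 2, 6
    start_logits = torch.randn(batch_size, seq_len)
    p_mask = torch.tensor([[1., 0., 0., 0., 0., 1.],
                           [1., 0., 0., 0., 1., 1.]])  # 1.0 on query/special tokens

    masked_logits = start_logits * (1 - p_mask) - 1e30 * p_mask
    probs = torch.softmax(masked_logits, dim=-1)  # masked positions end up at ~0 probability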
......@@ -1192,8 +1191,8 @@ class PoolerEndLogits(nn.Module):
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
The position of the first token for the labeled span.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS).
1.0 means the token should be masked.
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means the
token should be masked.
.. note::
......@@ -1296,13 +1295,15 @@ class SquadHeadOutput(ModelOutput):
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
(beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
......@@ -1361,8 +1362,8 @@ class SQuADHead(nn.Module):
is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Whether the question has a possible answer in the paragraph or not.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS).
1.0 means the token should be masked.
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means the
token should be masked.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
......@@ -1441,8 +1442,8 @@ class SequenceSummary(nn.Module):
Args:
config (:class:`~transformers.PretrainedConfig`):
The config used by the model. Relevant arguments in the config class of the model are (refer to the
actual config class of your model for the default values it uses):
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
config class of your model for the default values it uses):
- **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are:
......@@ -1455,7 +1456,7 @@ class SequenceSummary(nn.Module):
- **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
- **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
:obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
- **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
- **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
output, another string or :obj:`None` will add no activation.
- **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
activation.
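A minimal usage sketch, assuming an XLNet-style config that carries the ``summary_*`` options listed above (it defaults to ``summary_type="last"`` with a projection and a tanh activation)::

    import torch
    from transformers import XLNetConfig
    from transformers.modeling_utils import SequenceSummary

    config = XLNetConfig()
    summary = SequenceSummary(config)

    hidden_states = torch.randn(2, 10, config.d_model)  # (batch, seq_len, hidden_size)
    pooled = summary(hidden_states)                      # (batch, hidden_size) or (batch, num_labels)
    print(pooled.shape)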
......@@ -1618,8 +1619,8 @@ def prune_layer(
dim (:obj:`int`, `optional`): The dimension on which to keep the indices.
Returns:
:obj:`torch.nn.Linear` or :class:`~transformers.modeling_utils.Conv1D`:
The pruned layer as a new layer with :obj:`requires_grad=True`.
:obj:`torch.nn.Linear` or :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with
:obj:`requires_grad=True`.
"""
if isinstance(layer, nn.Linear):
return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
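A short sketch of the pruning helper on a plain linear layer (the sizes are arbitrary)::

    import torch
    from torch import nn
    from transformers.modeling_utils import prune_layer

    layer = nn.Linear(in_features=768, out_features=12)
    index = torch.tensor([0, 1, 4, 5], dtype=torch.long)  # output units to keep

    pruned = prune_layer(layer, index, dim=0)
    print(pruned.weight.shape)          # torch.Size([4, 768])
    print(pruned.weight.requires_grad)  # True -- returned as a fresh trainable layer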
......@@ -1647,7 +1648,8 @@ def apply_chunking_to_forward(
chunk_dim (:obj:`int`):
The dimension over which the :obj:`input_tensors` should be chunked.
input_tensors (:obj:`Tuple[torch.Tensor]`):
The input tensors of ``forward_fn`` which will be chunked.
The input tensors of ``forward_fn`` which will be chunked.
Returns:
:obj:`torch.Tensor`: A tensor with the same shape as :obj:`forward_fn` would have given if applied.
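The chunking trick itself can be sketched with plain PyTorch (``forward_fn`` here is a stand-in for a position-wise feed-forward block)::

    import torch

    def forward_fn(hidden_states: torch.Tensor) -> torch.Tensor:
        return hidden_states * 2  # stand-in for an expensive position-wise computation

    hidden_states = torch.randn(1, 4096, 768)
    chunk_size, chunk_dim = 256, 1

    # Apply forward_fn chunk by chunk along the sequence dimension, then re-assemble:
    # same result as a single call, but with a lower peak memory footprint.
    chunks = hidden_states.split(chunk_size, dim=chunk_dim)
    output = torch.cat([forward_fn(chunk) for chunk in chunks], dim=chunk_dim)

    assert output.shape == hidden_states.shape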
......
......@@ -12,7 +12,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch XLM model.
"""
PyTorch XLM model.
"""
......@@ -228,8 +229,9 @@ class TransformerFFN(nn.Module):
class XLMPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = XLMConfig
......@@ -278,7 +280,8 @@ class XLMForQuestionAnsweringOutput(ModelOutput):
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
(beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
......@@ -289,8 +292,8 @@ class XLMForQuestionAnsweringOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -312,14 +315,15 @@ XLM_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
usage and behavior.
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
general usage and behavior.
Parameters:
config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
XLM_INPUTS_DOCSTRING = r"""
......@@ -327,45 +331,43 @@ XLM_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.XLMTokenizer`.
See :meth:`transformers.PreTrainedTokenizer.encode` and
:meth:`transformers.PreTrainedTokenizer.__call__` for details.
Indices can be obtained using :class:`~transformers.XLMTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
langs (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
A parallel sequence of tokens to be used to indicate the language of each token in the input.
Indices are language ids which can be obtained from the language names by using two conversion mappings
provided in the configuration of the model (only provided for multilingual models).
More precisely, the `language name to language id` mapping is in :obj:`model.config.lang2id` (which is a
dictionary string to int) and the `language id to language name` mapping is in :obj:`model.config.id2lang`
(dictionary int to string).
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
language ids which can be obtained from the language names by using two conversion mappings provided in
the configuration of the model (only provided for multilingual models). More precisely, the `language name
to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
`language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).
See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``:
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`__
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`__
lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices.
You can also use `attention_mask` for the same result (see above), kept here for compatibility.
Indices selected in ``[0, ..., input_ids.size(-1)]``.
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
``[0, ..., input_ids.size(-1)]``.
cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
Dictionary string to ``torch.FloatTensor`` that contains precomputed hidden states (key and values in the
attention blocks) as computed by the model (see :obj:`cache` output below). Can be used to speed up
......@@ -374,8 +376,7 @@ XLM_INPUTS_DOCSTRING = r"""
The dictionary object will be modified in-place during the forward pass to add newly computed
hidden-states.
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
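A usage sketch for the ``langs`` argument described above, assuming a multilingual checkpoint whose config carries the ``lang2id`` mapping (the checkpoint name is one example of such a model)::

    import torch
    from transformers import XLMModel, XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-xnli15-1024")
    model = XLMModel.from_pretrained("xlm-mlm-xnli15-1024")

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    english_id = model.config.lang2id["en"]                   # language name -> language id
    langs = torch.full_like(inputs["input_ids"], english_id)  # one language id per token

    outputs = model(**inputs, langs=langs)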
......@@ -478,9 +479,9 @@ class XLMModel(XLMPreTrainedModel):
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
......@@ -672,8 +673,10 @@ class XLMPredLayer(nn.Module):
@add_start_docstrings(
"""The XLM Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
"""
The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
XLM_START_DOCSTRING,
)
class XLMWithLMHeadModel(XLMPreTrainedModel):
......@@ -726,11 +729,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``.
Indices are selected in ``[-100, 0, ..., config.vocab_size]``.
All labels set to ``-100`` are ignored (masked); the loss is only
computed for labels in ``[0, ..., config.vocab_size]``.
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
``-100`` are ignored (masked); the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
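A hedged usage sketch of the ``labels`` argument above: for language modeling the labels can simply be the input ids, since the shift happens inside the model (the checkpoint name is just an example)::

    from transformers import XLMTokenizer, XLMWithLMHeadModel

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    outputs = model(**inputs, labels=inputs["input_ids"], return_dict=True)

    print(outputs.loss)          # language modeling loss
    print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)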
......@@ -764,8 +765,10 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
@add_start_docstrings(
"""XLM Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
"""
XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
""",
XLM_START_DOCSTRING,
)
class XLMForSequenceClassification(XLMPreTrainedModel):
......@@ -803,9 +806,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss);
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss);
if :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......@@ -851,8 +853,10 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
@add_start_docstrings(
"""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
"""
XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLM_START_DOCSTRING,
)
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
......@@ -891,12 +895,12 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
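A hedged sketch of providing the span labels described above (the checkpoint name and the start/end indices are hypothetical values chosen for illustration)::

    import torch
    from transformers import XLMForQuestionAnsweringSimple, XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForQuestionAnsweringSimple.from_pretrained("xlm-mlm-en-2048")

    inputs = tokenizer("Who was Jim Henson?", "Jim Henson was a nice puppet", return_tensors="pt")
    start_positions = torch.tensor([8])   # hypothetical start token index of the answer span
    end_positions = torch.tensor([11])    # hypothetical end token index of the answer span

    outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions, return_dict=True)
    print(outputs.loss, outputs.start_logits.shape, outputs.end_logits.shape)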
......@@ -953,8 +957,10 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
@add_start_docstrings(
"""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
"""
XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD
(linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLM_START_DOCSTRING,
)
class XLMForQuestionAnswering(XLMPreTrainedModel):
......@@ -991,19 +997,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
Labels for position (index) of the classification token to use as input for computing plausibility of the
answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked. 0.0 means the token is not masked.
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means the token should be
masked. 0.0 means the token is not masked.
Returns:
......@@ -1067,8 +1074,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
@add_start_docstrings(
"""XLM Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
"""
XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
XLM_START_DOCSTRING,
)
class XLMForTokenClassification(XLMPreTrainedModel):
......@@ -1107,8 +1116,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
1]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......@@ -1159,8 +1168,10 @@ class XLMForTokenClassification(XLMPreTrainedModel):
@add_start_docstrings(
"""XLM Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
"""
XLM Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
XLM_START_DOCSTRING,
)
class XLMForMultipleChoice(XLMPreTrainedModel):
......@@ -1198,9 +1209,9 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension
of the input tensors. (See :obj:`input_ids` above)
Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
:obj:`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
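A hedged sketch of the multiple-choice input layout assumed above: every tensor gains a ``num_choices`` dimension and ``labels`` holds the index of the correct choice (the checkpoint name and the texts are illustrative)::

    import torch
    from transformers import XLMForMultipleChoice, XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForMultipleChoice.from_pretrained("xlm-mlm-en-2048")

    prompt = "In Italy, pizza is served"
    choices = ["in formal attire.", "sliced, at any time of day."]

    encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
    inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}  # (batch_size=1, num_choices=2, seq_len)
    labels = torch.tensor([1])                                 # the second choice is the correct one

    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)                  # logits: (batch_size, num_choices)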
......
......@@ -37,8 +37,8 @@ XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
class XLMProphetNetEncoder(ProphetNetEncoder):
r"""
This class overrides :class:`~transformers.ProphetNetEncoder`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.ProphetNetEncoder`. Please check the superclass for the appropriate
documentation alongside usage examples.
Example::
......@@ -59,8 +59,8 @@ class XLMProphetNetEncoder(ProphetNetEncoder):
class XLMProphetNetDecoder(ProphetNetDecoder):
r"""
This class overrides :class:`~transformers.ProphetNetDecoder`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.ProphetNetDecoder`. Please check the superclass for the appropriate
documentation alongside usage examples.
Example::
......@@ -81,8 +81,8 @@ class XLMProphetNetDecoder(ProphetNetDecoder):
class XLMProphetNetModel(ProphetNetModel):
r"""
This class overrides :class:`~transformers.ProphetNetModel`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.ProphetNetModel`. Please check the superclass for the appropriate
documentation alongside usage examples.
Example::
......@@ -104,8 +104,8 @@ class XLMProphetNetModel(ProphetNetModel):
class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
r"""
This class overrides :class:`~transformers.ProphetNetForConditionalGeneration`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.ProphetNetForConditionalGeneration`. Please check the superclass for the
appropriate documentation alongside usage examples.
Example::
......@@ -127,8 +127,8 @@ class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
r"""
This class overrides :class:`~transformers.ProphetNetForCausalLM`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.ProphetNetForCausalLM`. Please check the superclass for the appropriate
documentation alongside usage examples.
Example::
......
......@@ -48,14 +48,15 @@ XLM_ROBERTA_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
usage and behavior.
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
general usage and behavior.
Parameters:
config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
......@@ -65,8 +66,8 @@ XLM_ROBERTA_START_DOCSTRING = r"""
)
class XLMRobertaModel(RobertaModel):
"""
This class overrides :class:`~transformers.RobertaModel`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
......@@ -78,8 +79,8 @@ class XLMRobertaModel(RobertaModel):
)
class XLMRobertaForCausalLM(RobertaForCausalLM):
"""
This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
......@@ -91,64 +92,72 @@ class XLMRobertaForCausalLM(RobertaForCausalLM):
)
class XLMRobertaForMaskedLM(RobertaForMaskedLM):
"""
This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate
documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
@add_start_docstrings(
"""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
on top of the pooled output) e.g. for GLUE tasks. """,
"""
XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
XLM_ROBERTA_START_DOCSTRING,
)
class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
"""
This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
@add_start_docstrings(
"""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
"""
XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
XLM_ROBERTA_START_DOCSTRING,
)
class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
"""
This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
@add_start_docstrings(
"""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
"""
XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
XLM_ROBERTA_START_DOCSTRING,
)
class XLMRobertaForTokenClassification(RobertaForTokenClassification):
"""
This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
@add_start_docstrings(
"""XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
"""
XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLM_ROBERTA_START_DOCSTRING,
)
class XLMRobertaForQuestionAnswering(RobertaForQuestionAnswering):
"""
This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the
superclass for the appropriate documentation alongside usage examples.
This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
appropriate documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
......@@ -13,7 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch XLNet model.
"""
PyTorch XLNet model.
"""
......@@ -58,9 +59,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
"""A map of modules from TF to PyTorch.
I use a map to keep the PyTorch model as
identical to the original PyTorch model as possible.
"""
A map of modules from TF to PyTorch. I use a map to keep the PyTorch model as identical to the original PyTorch
model as possible.
"""
tf_to_pt_map = {}
......@@ -541,8 +542,9 @@ class XLNetLayer(nn.Module):
class XLNetPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = XLNetConfig
......@@ -598,8 +600,8 @@ class XLNetModelOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -634,8 +636,8 @@ class XLNetLMHeadModelOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -668,8 +670,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -702,8 +704,8 @@ class XLNetForTokenClassificationOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -738,8 +740,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -774,8 +776,8 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -796,13 +798,15 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
(beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
......@@ -817,8 +821,8 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
......@@ -841,14 +845,15 @@ XLNET_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
usage and behavior.
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
general usage and behavior.
Parameters:
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
XLNET_INPUTS_DOCSTRING = r"""
......@@ -856,14 +861,13 @@ XLNET_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`transformers.XLNetTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.__call__` for details.
Indices can be obtained using :class:`transformers.XLNetTokenizer`. See
:func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
......@@ -871,8 +875,8 @@ XLNET_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (see :obj:`mems` output below) . Can be used to speed up sequential
decoding. The token ids which have their past given to this model should not be passed as
:obj:`input_ids` as they have already been computed.
decoding. The token ids which have their past given to this model should not be passed as :obj:`input_ids`
as they have already been computed.
:obj:`use_cache` has to be set to :obj:`True` to make use of :obj:`mems`.
perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`):
......@@ -881,24 +885,23 @@ XLNET_INPUTS_DOCSTRING = r"""
- if ``perm_mask[k, i, j] = 0``, i attends to j in batch k;
- if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
If not set, each token attends to all the others (full bidirectional attention).
Only used during pretraining (to define factorization order) or for sequential decoding (generation).
If not set, each token attends to all the others (full bidirectional attention). Only used during
pretraining (to define factorization order) or for sequential decoding (generation).
target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`):
Mask to indicate the output tokens to use.
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
Only used during pretraining for partial prediction or for sequential decoding (generation).
Mask to indicate the output tokens to use. If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k
is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding
(generation).
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``:
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`__
input_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
Mask to avoid performing attention on padding token indices.
Negative of :obj:`attention_mask`, i.e. with 0 for real tokens and 1 for padding which is kept for
compatibility with the original code base.
Mask to avoid performing attention on padding token indices. Negative of :obj:`attention_mask`, i.e. with 0
for real tokens and 1 for padding which is kept for compatibility with the original code base.
Mask values selected in ``[0, 1]``:
......@@ -907,8 +910,7 @@ XLNET_INPUTS_DOCSTRING = r"""
You can only uses one of :obj:`input_mask` and :obj:`attention_mask`.
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
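A sketch of the ``perm_mask`` / ``target_mapping`` arguments described above, predicting only the final ``<mask>`` token (the checkpoint name is used for illustration)::

    import torch
    from transformers import XLNetLMHeadModel, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

    input_ids = tokenizer("Hello, my dog is very <mask>", add_special_tokens=False,
                          return_tensors="pt")["input_ids"]
    seq_len = input_ids.shape[1]

    perm_mask = torch.zeros((1, seq_len, seq_len))
    perm_mask[:, :, -1] = 1.0                   # no token may attend to the last (masked) token

    target_mapping = torch.zeros((1, 1, seq_len))
    target_mapping[0, 0, -1] = 1.0              # only the last token is predicted

    outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
    next_token_logits = outputs[0]              # shape (1, 1, vocab_size)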
......@@ -1279,8 +1281,9 @@ class XLNetModel(XLNetPreTrainedModel):
@add_start_docstrings(
"""XLNet Model with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
"""
XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
XLNET_START_DOCSTRING,
)
class XLNetLMHeadModel(XLNetPreTrainedModel):
......@@ -1360,18 +1363,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`):
Labels for masked language modeling.
:obj:`num_predict` corresponds to :obj:`target_mapping.shape[1]`. If :obj:`target_mapping` is :obj:`None`,
then :obj:`num_predict` corresponds to :obj:`sequence_length`.
Labels for masked language modeling. :obj:`num_predict` corresponds to :obj:`target_mapping.shape[1]`. If
:obj:`target_mapping` is :obj:`None`, then :obj:`num_predict` corresponds to :obj:`sequence_length`.
The labels should correspond to the masked input words that should be predicted and depends on
:obj:`target_mapping`. Note in order to perform standard auto-regressive language modeling a
`<mask>` token has to be added to the :obj:`input_ids` (see the :obj:`prepare_inputs_for_generation`
function and examples below)
:obj:`target_mapping`. Note in order to perform standard auto-regressive language modeling a `<mask>` token
has to be added to the :obj:`input_ids` (see the :obj:`prepare_inputs_for_generation` function and examples
below)
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored, the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to ``-100`` are ignored; the
loss is only computed for labels in ``[0, ..., config.vocab_size]``.
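A condensed sketch of the kind of example referenced above (the checkpoint name is an assumption; see the full examples in the source for details)::

    import torch
    from transformers import XLNetLMHeadModel, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

    # Append a <mask> token: this is the position whose id we want to predict.
    input_ids = torch.tensor(
        tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
    ).unsqueeze(0)

    # No token may attend to the last (masked) position.
    perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
    perm_mask[:, :, -1] = 1.0

    # Predict only the last token.
    target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
    target_mapping[0, 0, -1] = 1.0

    outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
    next_token_logits = outputs[0]  # shape (batch_size, 1, config.vocab_size)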
Return:
......@@ -1447,8 +1448,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
@add_start_docstrings(
"""XLNet Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
"""
XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
""",
XLNET_START_DOCSTRING,
)
class XLNetForSequenceClassification(XLNetPreTrainedModel):
......@@ -1488,9 +1491,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ...,
config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss).
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......@@ -1540,8 +1542,10 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
@add_start_docstrings(
"""XLNet Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
"""
XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
XLNET_START_DOCSTRING,
)
class XLNetForTokenClassification(XLNetPreTrainedModel):
......@@ -1580,9 +1584,9 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
1]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
......@@ -1635,8 +1639,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
@add_start_docstrings(
"""XLNet Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
"""
XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RACE/SWAG tasks.
""",
XLNET_START_DOCSTRING,
)
class XLNetForMultipleChoice(XLNetPreTrainedModel):
......@@ -1675,9 +1681,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension
of the input tensors. (See :obj:`input_ids` above)
Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
:obj:`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
......@@ -1734,8 +1740,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
@add_start_docstrings(
"""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
"""
XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING,
)
class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
......@@ -1776,12 +1784,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
......@@ -1841,8 +1849,10 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
@add_start_docstrings(
"""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
"""
XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING,
)
class XLNetForQuestionAnswering(XLNetPreTrainedModel):
......@@ -1884,19 +1894,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
Labels for position (index) of the classification token to use as input for computing plausibility of the
answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means token should be masked. 0.0 mean token is not masked.
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means the token should be
masked, 0.0 means the token is not masked.
Returns:
......
......@@ -70,8 +70,8 @@ def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: in
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
"""
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0,
after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
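A minimal usage sketch (``model`` and ``train_dataloader`` are assumed to exist)::

    from transformers import AdamW, get_linear_schedule_with_warmup

    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=1000
    )

    for batch in train_dataloader:
        loss = model(**batch)[0]  # assumes the model returns the loss first when labels are passed
        loss.backward()
        optimizer.step()
        scheduler.step()  # update the learning rate after every optimizer step
        optimizer.zero_grad()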
Args:
optimizer (:class:`~torch.optim.Optimizer`):
......@@ -170,9 +170,8 @@ def get_polynomial_decay_schedule_with_warmup(
optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
):
"""
Create a schedule with a learning rate that decreases as a polynomial decay
from the initial lr set in the optimizer to end lr defined by `lr_end`,
after a warmup period during which it increases linearly from 0 to the
Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
optimizer to end lr defined by `lr_end`, after a warmup period during which it increases linearly from 0 to the
initial lr set in the optimizer.
Args:
......@@ -189,8 +188,8 @@ def get_polynomial_decay_schedule_with_warmup(
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is
based on the original BERT implementation at
Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
implementation at
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
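A minimal sketch (``model`` is assumed to exist; with ``power=1.0``, the default, this reproduces the linear schedule)::

    from transformers.optimization import AdamW, get_polynomial_decay_schedule_with_warmup

    optimizer = AdamW(model.parameters(), lr=3e-5)
    scheduler = get_polynomial_decay_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=1000, lr_end=1e-7, power=2.0
    )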
Return:
......@@ -218,8 +217,8 @@ def get_polynomial_decay_schedule_with_warmup(
class AdamW(Optimizer):
"""
Implements Adam algorithm with weight decay fix as introduced in
`Decoupled Weight Decay Regularization <https://arxiv.org/abs/1711.05101>`__.
Implements Adam algorithm with weight decay fix as introduced in `Decoupled Weight Decay Regularization
<https://arxiv.org/abs/1711.05101>`__.
Parameters:
params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
......@@ -320,12 +319,13 @@ class AdamW(Optimizer):
class Adafactor(Optimizer):
"""
AdaFactor pytorch implementation can be used as a drop in replacement for Adam
original fairseq code: https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py
AdaFactor PyTorch implementation that can be used as a drop-in replacement for Adam. Original fairseq code:
https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py
Paper: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` https://arxiv.org/abs/1804.04235
Note that this optimizer internally adjusts the learning rate depending on the *scale_parameter*, *relative_step* and
*warmup_init* options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and `relative_step=False`.
Paper: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` https://arxiv.org/abs/1804.04235 Note that
this optimizer internally adjusts the learning rate depending on the *scale_parameter*, *relative_step* and
*warmup_init* options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
`relative_step=False`.
Arguments:
params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
......@@ -352,6 +352,7 @@ class Adafactor(Optimizer):
This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested it.
Recommended T5 finetuning settings:
- Scheduled LR warm-up to fixed LR
- disable relative updates
- use clip threshold: https://arxiv.org/abs/2004.14546
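A minimal sketch of using Adafactor with a manual (external) learning rate, as described above (``model`` is assumed to exist)::

    from transformers.optimization import Adafactor

    # With scale_parameter=False and relative_step=False, the externally supplied lr is used
    # instead of Adafactor's internal schedule.
    optimizer = Adafactor(
        model.parameters(),
        lr=1e-3,
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
    )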
......@@ -440,7 +441,9 @@ class Adafactor(Optimizer):
return torch.mm(r_factor.unsqueeze(-1), c_factor.unsqueeze(0))
def step(self, closure=None):
"""Performs a single optimization step.
"""
Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
......
......@@ -153,8 +153,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
"""
Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
with the m and v parameters in strange ways as shown in
`Decoupled Weight Decay Regularization <https://arxiv.org/abs/1711.05101>`__.
with the m and v parameters in strange ways as shown in `Decoupled Weight Decay Regularization
<https://arxiv.org/abs/1711.05101>`__.
Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
to adding the square of the weights to the loss with plain (non-momentum) SGD.
......@@ -169,8 +169,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
epsilon (:obj:`float`, `optional`, defaults to 1e-7):
The epsilon parameter in Adam, which is a small constant for numerical stability.
amsgrad (:obj:`bool`, `optional`, defaults to `False`):
Whether to apply AMSGrad varient of this algorithm or not, see
`On the Convergence of Adam and Beyond <https://arxiv.org/abs/1904.09237>`__.
Whether to apply the AMSGrad variant of this algorithm or not, see `On the Convergence of Adam and Beyond
<https://arxiv.org/abs/1904.09237>`__.
weight_decay_rate (:obj:`float`, `optional`, defaults to 0):
The weight decay to apply.
include_in_weight_decay (:obj:`List[str]`, `optional`):
......@@ -280,11 +280,10 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
# Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object):
"""Gradient accumulation utility.
When used with a distribution strategy, the accumulator should be called in a
replica context. Gradients will be accumulated locally on each replica and
without synchronization. Users should then call ``.gradients``, scale the
gradients if required, and pass the result to ``apply_gradients``.
"""
Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
then call ``.gradients``, scale the gradients if required, and pass the result to ``apply_gradients``.
"""
# We use the ON_READ synchronization policy so that no synchronization is
......
......@@ -128,7 +128,8 @@ def get_default_model(targeted_task: Dict, framework: Optional[str], task_option
"pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.
task_options (:obj:`Any`, None)
Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for translation task.
Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for a
translation task.
Returns
......@@ -239,8 +240,9 @@ class DefaultArgumentHandler(ArgumentHandler):
class PipelineDataFormat:
"""
Base class for all the pipeline supported data format both for reading and writing.
Supported data formats currently includes:
Base class for all the pipeline supported data formats, both for reading and writing. Supported data formats
currently include:
- JSON
- CSV
- stdin/stdout (pipe)
......@@ -323,8 +325,8 @@ class PipelineDataFormat:
overwrite=False,
) -> "PipelineDataFormat":
"""
Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending
on :obj:`format`.
Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending on
:obj:`format`.
Args:
format: (:obj:`str`):
......@@ -440,8 +442,7 @@ class JsonPipelineDataFormat(PipelineDataFormat):
class PipedPipelineDataFormat(PipelineDataFormat):
"""
Read data from piped input to the python process.
For multi columns data, columns should separated by \t
Read data from piped input to the python process. For multi-column data, columns should be separated by \t
If columns are provided, then the output will be a dictionary with {column_x: value_x}
......@@ -517,16 +518,16 @@ PIPELINE_INIT_ARGS = r"""
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
model is provided.
If no framework is specified, will default to the one currently installed. If no framework is specified and
both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
is provided.
task (:obj:`str`, defaults to :obj:`""`):
A task-identifier for the pipeline.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model
on the associated CUDA device id.
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
the associated CUDA device id.
binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
Flag indicating if the output of the pipeline should happen in a binary format (i.e., pickle) or as raw text.
"""
......@@ -538,8 +539,8 @@ class Pipeline(_ScikitCompat):
The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
different pipelines.
Base class implementing pipelined operations.
Pipeline workflow is defined as a sequence of the following operations:
Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
operations:
Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output
......@@ -691,10 +692,12 @@ class Pipeline(_ScikitCompat):
def _forward(self, inputs, return_tensors=False):
"""
Internal framework specific forward dispatching.
Internal framework specific forward dispatching
Args:
inputs: dict holding all the keyword arguments required by the model forward method.
return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array.
return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array
Returns:
Numpy array
"""
......@@ -740,16 +743,16 @@ class FeatureExtractionPipeline(Pipeline):
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
model is provided.
If no framework is specified, will default to the one currently installed. If no framework is specified and
both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
is provided.
task (:obj:`str`, defaults to :obj:`""`):
A task-identifier for the pipeline.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model
on the associated CUDA device id.
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
the associated CUDA device id.
"""
def __init__(
......@@ -796,25 +799,23 @@ class TextGenerationPipeline(Pipeline):
task identifier: :obj:`"text-generation"`.
The models that this pipeline can use are models that have been trained with an autoregressive language modeling
objective, which includes the uni-directional models in the library (e.g. gpt2).
See the list of available community models on
`huggingface.co/models <https://huggingface.co/models?filter=causal-lm>`__.
objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available
community models on `huggingface.co/models <https://huggingface.co/models?filter=causal-lm>`__.
"""
# Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
XL_PREFIX = """In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
XL_PREFIX = """
In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The
voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western
Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision
and denounces one of the men as a horse thief. Although his father initially slaps him for making such an
accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop,
begging for his blessing. <eod> </s> <eos>
"""
ALLOWED_MODELS = [
"XLNetLMHeadModel",
......@@ -881,12 +882,11 @@ class TextGenerationPipeline(Pipeline):
prefix (:obj:`str`, `optional`):
Prefix added to prompt.
generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate
method corresponding to your framework `here <./model.html#generative-models>`__).
Additional keyword arguments to pass along to the generate method of the model (see the generate method
corresponding to your framework `here <./model.html#generative-models>`__).
Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the
following keys:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
- **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
- **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
......@@ -985,19 +985,19 @@ class TextGenerationPipeline(Pipeline):
)
class TextClassificationPipeline(Pipeline):
"""
Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the
`sequence classification examples <../task_summary.html#sequence-classification>`__ for more information.
Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the `sequence classification
examples <../task_summary.html#sequence-classification>`__ for more information.
This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative
sentiments).
If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run
a softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result.
If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run a
softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result.
The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=text-classification>`__.
The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
the up-to-date list of available models on `huggingface.co/models
<https://huggingface.co/models?filter=text-classification>`__.
"""
def __init__(self, return_all_scores: bool = False, **kwargs):
......@@ -1020,8 +1020,7 @@ class TextClassificationPipeline(Pipeline):
One or several texts (or one list of prompts) to classify.
Return:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the
following keys:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
- **label** (:obj:`str`) -- The label predicted.
- **score** (:obj:`float`) -- The corresponding probability.
......@@ -1085,16 +1084,15 @@ class ZeroShotClassificationPipeline(Pipeline):
language inference) tasks.
Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the
candidate label being valid. Any NLI model can be used as long as the first output logit corresponds to
`contradiction` and the last to `entailment`.
pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the candidate
label being valid. Any NLI model can be used as long as the first output logit corresponds to `contradiction` and
the last to `entailment`.
This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"zero-shot-classification"`.
This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task identifier:
:obj:`"zero-shot-classification"`.
The models that this pipeline can use are models that have been fine-tuned on an NLI task.
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?search=nli>`__.
The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list
of available models on `huggingface.co/models <https://huggingface.co/models?search=nli>`__.
"""
def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
......@@ -1126,21 +1124,20 @@ class ZeroShotClassificationPipeline(Pipeline):
The set of possible class labels to classify each sequence into. Can be a single label, a string of
comma-separated labels, or a list of labels.
hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This example is {}."`):
The template used to turn each label into an NLI-style hypothesis. This template must include a {}
or similar syntax for the candidate label to be inserted into the template. For example, the default
The template used to turn each label into an NLI-style hypothesis. This template must include a {} or
similar syntax for the candidate label to be inserted into the template. For example, the default
template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed
into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The
default template works well in many cases, but it may be worthwhile to experiment with different
templates depending on the task setting.
multi_class (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized
such that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are
considered independent and probabilities are normalized for each candidate by doing a softmax of
the entailment score vs. the contradiction score.
Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized such
that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are considered
independent and probabilities are normalized for each candidate by doing a softmax of the entailment
score vs. the contradiction score.
Return:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the
following keys:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:
- **sequence** (:obj:`str`) -- The sequence for which this is the output.
- **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
......@@ -1188,15 +1185,14 @@ class ZeroShotClassificationPipeline(Pipeline):
)
class FillMaskPipeline(Pipeline):
"""
Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the
`masked language modeling examples <../task_summary.html#masked-language-modeling>`__ for more information.
Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the `masked language modeling
examples <../task_summary.html#masked-language-modeling>`__ for more information.
This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"fill-mask"`.
This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
identifier: :obj:`"fill-mask"`.
The models that this pipeline can use are models that have been trained with a masked language modeling objective,
which includes the bi-directional models in the library.
See the up-to-date list of available models on
which includes the bi-directional models in the library. See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=masked-lm>`__.
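A minimal sketch (the default checkpoint, and therefore its mask token, is an assumption)::

    from transformers import pipeline

    unmasker = pipeline("fill-mask")
    unmasker(f"Paris is the {unmasker.tokenizer.mask_token} of France.", top_k=3)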
.. note::
......@@ -1262,14 +1258,13 @@ class FillMaskPipeline(Pipeline):
One or several texts (or one list of prompts) with masked tokens.
targets (:obj:`str` or :obj:`List[str]`, `optional`):
When passed, the model will return the scores for the passed token or tokens rather than the top k
predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will
be tokenized and the first resulting token will be used (with a warning).
predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will be
tokenized and the first resulting token will be used (with a warning).
top_k (:obj:`int`, `optional`):
When passed, overrides the number of predictions to return.
Return:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the
following keys:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
- **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction.
- **score** (:obj:`float`) -- The corresponding probability.
......@@ -1369,16 +1364,16 @@ class FillMaskPipeline(Pipeline):
)
class TokenClassificationPipeline(Pipeline):
"""
Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the
`named entity recognition examples <../task_summary.html#named-entity-recognition>`__ for more information.
Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the `named entity recognition
examples <../task_summary.html#named-entity-recognition>`__ for more information.
This token recognition pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location
or miscellaneous).
The models that this pipeline can use are models that have been fine-tuned on a token classification task.
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=token-classification>`__.
The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
up-to-date list of available models on `huggingface.co/models
<https://huggingface.co/models?filter=token-classification>`__.
"""
default_input_names = "sequences"
......@@ -1560,11 +1555,11 @@ NerPipeline = TokenClassificationPipeline
class QuestionAnsweringArgumentHandler(ArgumentHandler):
"""
QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
to internal :class:`~transformers.SquadExample`.
QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
internal :class:`~transformers.SquadExample`.
QuestionAnsweringArgumentHandler manages all the possible to create a :class:`~transformers.SquadExample` from
the command-line supplied arguments.
QuestionAnsweringArgumentHandler manages all the possible ways to create a :class:`~transformers.SquadExample` from
the command-line supplied arguments.
"""
def __call__(self, *args, **kwargs):
......@@ -1623,15 +1618,15 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
@add_end_docstrings(PIPELINE_INIT_ARGS)
class QuestionAnsweringPipeline(Pipeline):
"""
Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the
`question answering examples <../task_summary.html#question-answering>`__ for more information.
Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the `question answering examples
<../task_summary.html#question-answering>`__ for more information.
This question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"question-answering"`.
The models that this pipeline can use are models that have been fine-tuned on a question answering task.
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=question-answering>`__.
The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
up-to-date list of available models on `huggingface.co/models
<https://huggingface.co/models?filter=question-answering>`__.
"""
default_input_names = "question,context"
......@@ -1666,9 +1661,8 @@ class QuestionAnsweringPipeline(Pipeline):
question: Union[str, List[str]], context: Union[str, List[str]]
) -> Union[SquadExample, List[SquadExample]]:
"""
QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally.
This helper method encapsulate all the logic for converting question(s) and context(s) to
:class:`~transformers.SquadExample`.
QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally. This helper method
encapsulates all the logic for converting question(s) and context(s) to :class:`~transformers.SquadExample`.
We currently support extractive question answering.
......@@ -1677,8 +1671,8 @@ class QuestionAnsweringPipeline(Pipeline):
context (:obj:`str` or :obj:`List[str]`): The context(s) in which we will look for the answer.
Returns:
One or a list of :class:`~transformers.SquadExample`: The corresponding
:class:`~transformers.SquadExample` grouping question and context.
One or a list of :class:`~transformers.SquadExample`: The corresponding :class:`~transformers.SquadExample`
grouping question and context.
"""
if isinstance(question, list):
return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
......@@ -1693,11 +1687,11 @@ class QuestionAnsweringPipeline(Pipeline):
args (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`):
One or several :class:`~transformers.SquadExample` containing the question and context.
X (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
One or several :class:`~transformers.SquadExample` containing the question and context
(will be treated the same way as if passed as the first positional argument).
One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
the same way as if passed as the first positional argument).
data (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
One or several :class:`~transformers.SquadExample` containing the question and context
(will be treated the same way as if passed as the first positional argument).
One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
the same way as if passed as the first positional argument).
question (:obj:`str` or :obj:`List[str]`):
One or several question(s) (must be used in conjunction with the :obj:`context` argument).
context (:obj:`str` or :obj:`List[str]`):
......@@ -1719,8 +1713,7 @@ class QuestionAnsweringPipeline(Pipeline):
Whether or not we accept impossible as an answer.
Return:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the
following keys:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:
- **score** (:obj:`float`) -- The probability associated to the answer.
- **start** (:obj:`int`) -- The start index of the answer (in the tokenized version of the input).
......@@ -1825,12 +1818,12 @@ class QuestionAnsweringPipeline(Pipeline):
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
"""
Take the output of any :obj:`ModelForQuestionAnswering` and will generate probabilities for each span to be
the actual answer.
Takes the output of any :obj:`ModelForQuestionAnswering` and generates probabilities for each span to be the
actual answer.
In addition, it filters out some unwanted/impossible cases like answer len being greater than
max_answer_len or answer end position being before the starting position.
The method supports output the k-best answer through the topk argument.
In addition, it filters out some unwanted/impossible cases, like the answer length being greater than
max_answer_len or the answer end position being before the starting position. The method supports outputting the
k-best answers through the topk argument.
Args:
start (:obj:`np.ndarray`): Individual start probabilities for each token.
......@@ -1866,8 +1859,7 @@ class QuestionAnsweringPipeline(Pipeline):
def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
"""
When decoding from token probabilities, this method maps token indexes to actual word in
the initial context.
When decoding from token probabilities, this method maps token indexes to the actual words in the initial context.
Args:
text (:obj:`str`): The actual context to extract the answer from.
......@@ -1914,13 +1906,12 @@ class SummarizationPipeline(Pipeline):
"""
Summarize news articles and other documents.
This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"summarization"`.
This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
identifier: :obj:`"summarization"`.
The models that this pipeline can use are models that have been fine-tuned on a summarization task,
which is currently, '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'.
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.
The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is
currently, '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. See the up-to-date
list of available models on `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.
Usage::
......@@ -1957,17 +1948,16 @@ class SummarizationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate
method corresponding to your framework `here <./model.html#generative-models>`__).
Additional keyword arguments to pass along to the generate method of the model (see the generate method
corresponding to your framework `here <./model.html#generative-models>`__).
Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the
following keys:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
- **summary_text** (:obj:`str`, present when ``return_text=True``) -- The summary of the corresponding
input.
- **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
-- The token ids of the summary.
- **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) --
The token ids of the summary.
"""
assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
assert len(documents) > 0, "Please provide a document to summarize"
......@@ -2043,12 +2033,12 @@ class TranslationPipeline(Pipeline):
"""
Translates from one language to another.
This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"translation_xx_to_yy"`.
This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
identifier: :obj:`"translation_xx_to_yy"`.
The models that this pipeline can use are models that have been fine-tuned on a translation task.
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=translation>`__.
The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
up-to-date list of available models on `huggingface.co/models
<https://huggingface.co/models?filter=translation>`__.
Usage::
en_fr_translator = pipeline("translation_en_to_fr")
......@@ -2078,12 +2068,11 @@ class TranslationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate
method corresponding to your framework `here <./model.html#generative-models>`__).
Additional keyword arguments to pass along to the generate method of the model (see the generate method
corresponding to your framework `here <./model.html#generative-models>`__).
Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the
following keys:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
- **translation_text** (:obj:`str`, present when ``return_text=True``) -- The translation.
- **translation_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
......@@ -2153,12 +2142,11 @@ class Text2TextGenerationPipeline(Pipeline):
"""
Pipeline for text to text generation using seq2seq models.
This Text2TextGenerationPipeline pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"text2text-generation"`.
This Text2TextGenerationPipeline pipeline can currently be loaded from :func:`~transformers.pipeline` using the
following task identifier: :obj:`"text2text-generation"`.
The models that this pipeline can use are models that have been fine-tuned on a translation task.
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=seq2seq>`__.
The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
up-to-date list of available models on `huggingface.co/models <https://huggingface.co/models?filter=seq2seq>`__.
Usage::
......@@ -2191,12 +2179,11 @@ class Text2TextGenerationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate
method corresponding to your framework `here <./model.html#generative-models>`__).
Additional keyword arguments to pass along to the generate method of the model (see the generate method
corresponding to your framework `here <./model.html#generative-models>`__).
Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the
following keys:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
- **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
- **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
......@@ -2346,10 +2333,8 @@ class Conversation:
Return:
:obj:`str`:
Example:
Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114
user >> Going to the movies tonight - any suggestions?
bot >> The Big Lebowski
Example: Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 user >> Going to the movies tonight - any
suggestions? bot >> The Big Lebowski
"""
output = "Conversation id: {} \n".format(self.uuid)
for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):
......@@ -2371,13 +2356,13 @@ class ConversationalPipeline(Pipeline):
"""
Multi-turn conversational pipeline.
This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"conversational"`.
This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
identifier: :obj:`"conversational"`.
The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task,
currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`.
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=conversational>`__.
currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`. See the
up-to-date list of available models on `huggingface.co/models
<https://huggingface.co/models?filter=conversational>`__.
Usage::
......@@ -2419,8 +2404,8 @@ class ConversationalPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate
method corresponding to your framework `here <./model.html#generative-models>`__).
Additional keyword arguments to pass along to the generate method of the model (see the generate method
corresponding to your framework `here <./model.html#generative-models>`__).
Returns:
:class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`: Conversation(s) with
......@@ -2506,8 +2491,9 @@ class ConversationalPipeline(Pipeline):
"""
Cleans the padding history. Padding may be generated in two places when multiple conversations are provided as
an input:
- at the end of the concatenated history and new user input, so that all inputs to the model have the same
length
length
- at the end of the generated response, as some responses will be longer than others
This method cleans up these padding tokens so that the history for each conversation is not impacted by the
batching process.
......@@ -2651,8 +2637,8 @@ SUPPORTED_TASKS = {
def check_task(task: str) -> Tuple[Dict, Any]:
"""
Checks an incoming task string, to validate it's correct and return the
default Pipeline and Model classes, and default models if they exist.
Checks an incoming task string, to validate it's correct and return the default Pipeline and Model classes, and
default models if they exist.
Args:
task (:obj:`str`):
......@@ -2670,9 +2656,8 @@ def check_task(task: str) -> Tuple[Dict, Any]:
- :obj:`"conversational"`
Returns:
(task_defaults:obj:`dict`, task_options: (:obj:`tuple`, None))
The actual dictionnary required to initialize the pipeline and some
extra task options for parametrized tasks like "translation_XX_to_YY"
(task_defaults:obj:`dict`, task_options: (:obj:`tuple`, None)) The actual dictionary required to initialize
the pipeline and some extra task options for parametrized tasks like "translation_XX_to_YY"
"""
......@@ -2737,17 +2722,16 @@ def pipeline(
If not provided, the default for the :obj:`task` will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
identifier or an actual pretrained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
identifier or an actual pretrained tokenizer inheriting from :class:`~transformers.PreTrainedTokenizer`.
If not provided, the default for the :obj:`task` will be loaded.
framework (:obj:`str`, `optional`):
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
model is provided.
If no framework is specified, will default to the one currently installed. If no framework is specified and
both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
is provided.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`).
kwargs:
......
......@@ -75,7 +75,8 @@ class Index:
Returns:
:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`: A tensor of indices of retrieved documents.
:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`: A tensor of vector representations of retrieved documents.
:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`: A tensor of vector representations of
retrieved documents.
"""
raise NotImplementedError
......@@ -87,16 +88,17 @@ class Index:
def init_index(self):
"""
A function responsible for loading the index into memory. Should be called only once per training run of a RAG model.
E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load the index.
A function responsible for loading the index into memory. Should be called only once per training run of a RAG
model. E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load
the index.
"""
raise NotImplementedError
class LegacyIndex(Index):
"""
An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR.
We use default faiss index parameters as specified in that repository.
An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR. We use
default faiss index parameters as specified in that repository.
Args:
vector_size (:obj:`int`):
......@@ -234,17 +236,20 @@ class HFIndexBase(Index):
class CanonicalHFIndex(HFIndexBase):
"""
A wrapper around an instance of :class:`~datasets.Datasets`. If ``index_path`` is set to ``None``,
we load the pre-computed index available with the :class:`~datasets.arrow_dataset.Dataset`, otherwise, we load the index from the indicated path on disk.
A wrapper around an instance of :class:`~datasets.Datasets`. If ``index_path`` is set to ``None``, we load the
pre-computed index available with the :class:`~datasets.arrow_dataset.Dataset`, otherwise, we load the index from
the indicated path on disk.
Args:
vector_size (:obj:`int`): the dimension of the passage embeddings used by the index
dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
A datatset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids with ``datasets.list_datasets()``).
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with ``datasets.list_datasets()``).
dataset_split (:obj:`str`, optional, defaults to ``train``)
Which split of the ``dataset`` to load.
index_name (:obj:`str`, optional, defaults to ``train``)
The index_name of the index associated with the ``dataset``. The index loaded from ``index_path`` will be saved under this name.
The index_name of the index associated with the ``dataset``. The index loaded from ``index_path`` will be
saved under this name.
index_path (:obj:`str`, optional, defaults to ``None``)
The path to the serialized faiss index on disk.
use_dummy_dataset (:obj:`bool`, optional, defaults to ``False``): If True, use the dummy configuration of the dataset for tests.
......@@ -292,14 +297,14 @@ class CanonicalHFIndex(HFIndexBase):
class CustomHFIndex(HFIndexBase):
"""
A wrapper around an instance of :class:`~datasets.Datasets`.
The dataset and the index are both loaded from the indicated paths on disk.
A wrapper around an instance of :class:`~datasets.Datasets`. The dataset and the index are both loaded from the
indicated paths on disk.
Args:
vector_size (:obj:`int`): the dimension of the passage embeddings used by the index
dataset_path (:obj:`str`):
The path to the serialized dataset on disk.
The dataset should have 3 columns: title (str), text (str) and embeddings (arrays of dimension vector_size)
The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
embeddings (arrays of dimension vector_size)
index_path (:obj:`str`)
The path to the serialized faiss index on disk.
"""
......@@ -328,17 +333,17 @@ class CustomHFIndex(HFIndexBase):
class RagRetriever:
"""
Retriever used to get documents from vector queries.
It retrieves the documents embeddings as well as the documents contents, and it formats them to be used with a RagModel.
Retriever used to get documents from vector queries. It retrieves the document embeddings as well as the document
contents, and it formats them to be used with a RagModel.
Args:
config (:class:`~transformers.RagConfig`):
The configuration of the RAG model this Retriever is used with. Contains parameters indicating which ``Index`` to build.
You can load your own custom dataset with ``config.index_name="custom"`` or use a canonical one (default) from the datasets library
with ``config.index_name="wiki_dpr"`` for example.
The configuration of the RAG model this Retriever is used with. Contains parameters indicating which
``Index`` to build. You can load your own custom dataset with ``config.index_name="custom"`` or use a
canonical one (default) from the datasets library with ``config.index_name="wiki_dpr"`` for example.
question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
The tokenizer that was used to tokenize the question.
It is used to decode the question and then use the generator_tokenizer.
The tokenizer that was used to tokenize the question. It is used to decode the question, which is then
re-encoded with the generator_tokenizer.
generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
The tokenizer used for the generator part of the RagModel.
index (:class:`~transformers.retrieval_rag.Index`, optional, defaults to the one defined by the configuration):
......@@ -470,8 +475,8 @@ class RagRetriever:
Prefix added at the beginning of each input, typically used with T5-based models.
Return:
:obj:`tuple(tensors)`:
a tuple consisting of two elements: contextualized ``input_ids`` and a compatible ``attention_mask``.
:obj:`tuple(tensors)`: a tuple consisting of two elements: contextualized ``input_ids`` and a compatible
``attention_mask``.
"""
def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
......@@ -542,11 +547,10 @@ class RagRetriever:
The number of docs retrieved per query.
Return:
:obj:`Tuple[np.ndarray, np.ndarray, List[dict]]`:
A tuple with the following objects:
:obj:`Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:
- **retrieved_doc_embeds** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`) -- The
retrieval embeddings of the retrieved docs per query.
- **retrieved_doc_embeds** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`) -- The retrieval
embeddings of the retrieved docs per query.
- **doc_ids** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`) -- The ids of the documents in the
index
- **doc_dicts** (:obj:`List[dict]`): The :obj:`retrieved_doc_embeds` examples per query.
......@@ -581,16 +585,18 @@ class RagRetriever:
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
Output:
:class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
Returns: :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following
fields:
- **context_input_ids** -- List of token ids to be fed to a model.
`What are input IDs? <../glossary.html#input-ids>`__
- **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
:obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
- **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model
(when :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
`What are attention masks? <../glossary.html#attention-mask>`__
- **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
- **doc_ids** -- List of ids of the retrieved documents
"""
......
......@@ -88,8 +88,8 @@ def is_pipeline_test(test_case):
"""
Decorator marking a test as a pipeline test.
Pipeline tests are skipped by default and we can run only them by setting RUN_PIPELINE_TEST environment variable
to a truthy value and selecting the is_pipeline_test pytest mark.
Pipeline tests are skipped by default and can be run on their own by setting the RUN_PIPELINE_TEST environment
variable to a truthy value and selecting the is_pipeline_test pytest mark.
"""
if not _run_pipeline_tests:
......@@ -107,8 +107,7 @@ def slow(test_case):
"""
Decorator marking a test as slow.
Slow tests are skipped by default. Set the RUN_SLOW environment variable
to a truthy value to run them.
Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them.
"""
if not _run_slow_tests:
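As a hedged illustration of how this decorator is used in a test module (the test class and body below are made up)::

    import unittest
    from transformers.testing_utils import slow

    class ExampleIntegrationTest(unittest.TestCase):
        @slow
        def test_long_running_generation(self):
            # executed only when the RUN_SLOW environment variable is truthy
            self.assertTrue(True)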
......@@ -121,9 +120,8 @@ def custom_tokenizers(test_case):
"""
Decorator marking a test for a custom tokenizer.
Custom tokenizers require additional dependencies, and are skipped
by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
to a truthy value to run them.
Custom tokenizers require additional dependencies, and are skipped by default. Set the RUN_CUSTOM_TOKENIZERS
environment variable to a truthy value to run them.
"""
if not _run_custom_tokenizers:
return unittest.skip("test of custom tokenizers")(test_case)
......@@ -201,8 +199,7 @@ def require_torch_multigpu(test_case):
These tests are skipped on a machine without multiple GPUs.
To run *only* the multigpu tests, assuming all test names contain multigpu:
$ pytest -sv ./tests -k "multigpu"
To run *only* the multigpu tests, assuming all test names contain multigpu: $ pytest -sv ./tests -k "multigpu"
"""
if not _torch_available:
return unittest.skip("test requires PyTorch")(test_case)
......@@ -306,8 +303,8 @@ def get_tests_dir(append_path=None):
append_path: optional path to append to the tests dir path
Return:
The full path to the `tests` dir, so that the tests can be invoked from anywhere.
Optionally `append_path` is joined after the `tests` dir if the former is provided.
The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is
joined after the `tests` dir if the former is provided.
"""
# this function caller's __file__
......@@ -344,30 +341,29 @@ def assert_screenout(out, what):
class CaptureStd:
"""Context manager to capture:
stdout, clean it up and make it available via obj.out
stderr, and make it available via obj.err
"""
Context manager to capture:
stdout, clean it up and make it available via obj.out; stderr, and make it available via obj.err
init arguments:
- out - capture stdout: True/False, default True
- err - capture stderr: True/False, default True
init arguments:
- out - capture stdout: True/False, default True
- err - capture stderr: True/False, default True
Examples::
Examples::
with CaptureStdout() as cs:
print("Secret message")
print(f"captured: {cs.out}")
with CaptureStdout() as cs:
print("Secret message")
print(f"captured: {cs.out}")
import sys
with CaptureStderr() as cs:
print("Warning: ", file=sys.stderr)
print(f"captured: {cs.err}")
import sys
with CaptureStderr() as cs:
print("Warning: ", file=sys.stderr)
print(f"captured: {cs.err}")
# to capture just one of the streams, but not the other
with CaptureStd(err=False) as cs:
print("Secret message")
print(f"captured: {cs.out}")
# but best use the stream-specific subclasses
# to capture just one of the streams, but not the other
with CaptureStd(err=False) as cs:
print("Secret message")
print(f"captured: {cs.out}")
# but best use the stream-specific subclasses
"""
......@@ -436,7 +432,8 @@ class CaptureStderr(CaptureStd):
class CaptureLogger:
"""Context manager to capture `logging` streams
"""
Context manager to capture `logging` streams
Args:
- logger: `logging` logger object
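A minimal usage sketch, assuming the captured text is exposed via ``cl.out`` as with the other capture helpers
above (the logger name is only an example)::

    from transformers.utils import logging
    from transformers.testing_utils import CaptureLogger

    logging.set_verbosity_info()
    logger = logging.get_logger("transformers.tokenization_bart")
    with CaptureLogger(logger) as cl:
        logger.info("Testing 1, 2, 3")
    assert "Testing 1, 2, 3" in cl.out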
......@@ -476,13 +473,12 @@ class CaptureLogger:
class TestCasePlus(unittest.TestCase):
"""This class extends `unittest.TestCase` with additional features.
"""
This class extends `unittest.TestCase` with additional features.
Feature 1: Flexible auto-removable temp dirs which are guaranteed to get
removed at the end of test.
Feature 1: Flexible auto-removable temp dirs which are guaranteed to get removed at the end of test.
In all the following scenarios the temp dir will be auto-removed at the end
of test, unless `after=False`.
In all the following scenarios the temp dir will be auto-removed at the end of test, unless `after=False`.
# 1. create a unique temp dir, `tmp_dir` will contain the path to the created temp dir
......@@ -491,38 +487,35 @@ class TestCasePlus(unittest.TestCase):
def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir()
# 2. create a temp dir of my choice and delete it at the end - useful for debug when you want to
# monitor a specific directory
# 2. create a temp dir of my choice and delete it at the end - useful for debug when you want to
# monitor a specific directory
::
def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test")
# 3. create a temp dir of my choice and do not delete it at the end - useful for when you want
# to look at the temp results
# 3. create a temp dir of my choice and do not delete it at the end - useful for when you want
# to look at the temp results
::
def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", after=False)
# 4. create a temp dir of my choice and ensure to delete it right away - useful for when you
# disabled deletion in the previous test run and want to make sure that the tmp dir is empty
# before the new test is run
# 4. create a temp dir of my choice and ensure to delete it right away - useful for when you
# disabled deletion in the previous test run and want to make sure that the tmp dir is empty
# before the new test is run
::
def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", before=True)
Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the
project repository checkout are allowed if an explicit `tmp_dir` is used, so
that by mistake no `/tmp` or similar important part of the filesystem will
get nuked. i.e. please always pass paths that start with `./`
Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are
allowed if an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem
will get nuked. i.e. please always pass paths that start with `./`
Note 2: Each test can register multiple temp dirs and they all will get
auto-removed, unless requested otherwise.
Note 2: Each test can register multiple temp dirs and they all will get auto-removed, unless requested otherwise.
"""
......@@ -540,8 +533,8 @@ class TestCasePlus(unittest.TestCase):
delete the tmp dir at the end of the test
Returns:
tmp_dir(:obj:`string`):
either the same value as passed via `tmp_dir` or the path to the auto-created tmp dir
tmp_dir(:obj:`string`): either the same value as passed via `tmp_dir` or the path to the auto-created tmp
dir
"""
if tmp_dir is not None:
# using provided path
......@@ -577,11 +570,10 @@ class TestCasePlus(unittest.TestCase):
def mockenv(**kwargs):
"""this is a convenience wrapper, that allows this:
"""
this is a convenience wrapper, that allows this:
@mockenv(RUN_SLOW=True, USE_TF=False)
def test_something():
run_slow = os.getenv("RUN_SLOW", False)
use_tf = os.getenv("USE_TF", False)
@mockenv(RUN_SLOW=True, USE_TF=False)
def test_something():
    run_slow = os.getenv("RUN_SLOW", False)
    use_tf = os.getenv("USE_TF", False)
"""
return unittest.mock.patch.dict(os.environ, kwargs)
......@@ -78,35 +78,33 @@ class AlbertTokenizer(PreTrainedTokenizer):
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -224,9 +222,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
An ALBERT sequence has the following format:
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. An ALBERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
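The layout above can be checked directly; a short sketch (the checkpoint name and sample sentences are illustrative)::

    from transformers import AlbertTokenizer

    tok = AlbertTokenizer.from_pretrained("albert-base-v2")
    ids_a = tok.convert_tokens_to_ids(tok.tokenize("Hello world"))
    ids_b = tok.convert_tokens_to_ids(tok.tokenize("How are you?"))
    single = tok.build_inputs_with_special_tokens(ids_a)       # [CLS] X [SEP]
    pair = tok.build_inputs_with_special_tokens(ids_a, ids_b)  # [CLS] A [SEP] B [SEP]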
......@@ -281,8 +278,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
An ALBERT sequence pair mask has the following format:
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
::
......
......@@ -71,10 +71,11 @@ SPIECE_UNDERLINE = "▁"
class AlbertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on
`SentencePiece <https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
<https://github.com/google/sentencepiece>`__. This tokenizer inherits from
:class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this
superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
......@@ -87,31 +88,26 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Whether or not to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
.. note:: When building a sequence using special tokens, this is not the token that is used for the
beginning of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end of sequence. The
token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
Attributes:
modeling. This is the token which the model will try to predict.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
......@@ -162,9 +158,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
An ALBERT sequence has the following format:
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. An ALBERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
......@@ -219,8 +214,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An ALBERT sequence pair mask has the following format:
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
::
......
......@@ -221,8 +221,8 @@ SLOW_TOKENIZER_MAPPING = {
class AutoTokenizer:
r"""
This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library
when created with the :meth:`AutoTokenizer.from_pretrained` class method.
This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
created with the :meth:`AutoTokenizer.from_pretrained` class method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
......@@ -257,8 +257,8 @@ class AutoTokenizer:
using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.,
``./my_model_directory/``.
- A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``.
(Not applicable to all derived classes)
single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. (Not
applicable to all derived classes)
inputs (additional positional arguments, `optional`):
Will be passed along to the Tokenizer ``__init__()`` method.
config (:class:`~transformers.PreTrainedConfig`, `optional`)
......@@ -273,9 +273,8 @@ class AutoTokenizer:
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
proxies (:obj:`Dict[str, str]`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.,
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
request.
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to try to load the fast version of the tokenizer.
kwargs (additional keyword arguments, `optional`):
......
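A hedged example tying these options together (the checkpoint name is a placeholder and the proxy line is commented
out because the address is fictitious)::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "bert-base-uncased",
        use_fast=True,
        # proxies={"http": "foo.bar:3128"},  # per-protocol proxies, if needed
    )
    encoded = tokenizer("Hello world")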
......@@ -44,8 +44,8 @@ class BartTokenizer(RobertaTokenizer):
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
the initialization parameters and other methods.
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
initialization parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
......@@ -75,13 +75,13 @@ class BartTokenizer(RobertaTokenizer):
tgt_texts: (:obj:`List[str]`, `optional`):
List of summaries or target language texts.
max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts).
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
is required by one of the truncation/padding parameters. If the model has no specific maximum input
length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries).
If left unset or set to :obj:`None`, this will use the max_length value.
Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
......@@ -122,8 +122,8 @@ class BartTokenizer(RobertaTokenizer):
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **labels** -- List of token ids for tgt_texts
The full set of keys ``[input_ids, attention_mask, labels]``,
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed.
Otherwise, input_ids, attention_mask will be the only keys.
"""
kwargs.pop("src_lang", None)
kwargs.pop("tgt_lang", None)
......
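A short sketch of the call described above (the checkpoint name and texts are made up)::

    from transformers import BartTokenizer

    tok = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    batch = tok.prepare_seq2seq_batch(
        src_texts=["A long article to summarize."],
        tgt_texts=["A short summary."],
        max_length=1024,
        max_target_length=64,
        return_tensors="pt",
    )
    # batch contains input_ids and attention_mask; labels is present because tgt_texts was passed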
......@@ -70,13 +70,13 @@ class BartTokenizerFast(RobertaTokenizerFast):
tgt_texts: (:obj:`List[str]`, `optional`):
List of summaries or target language texts.
max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts).
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
is required by one of the truncation/padding parameters. If the model has no specific maximum input
length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries).
If left unset or set to :obj:`None`, this will use the max_length value.
Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
......@@ -116,11 +116,11 @@ class BartTokenizerFast(RobertaTokenizerFast):
- **input_ids** -- List of token ids to be fed to the encoder.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **decoder_input_ids** -- List of token ids to be fed to the decoder.
- **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the decoder.
This does not include causal mask, which is built by the model.
- **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the
decoder. This does not include causal mask, which is built by the model.
The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]``,
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]``, will only
be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
"""
if max_length is None:
max_length = self.model_max_length
......
......@@ -135,15 +135,14 @@ class BertTokenizer(PreTrainedTokenizer):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
......@@ -250,9 +249,8 @@ class BertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A BERT sequence has the following format:
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
......@@ -307,8 +305,8 @@ class BertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
::
......@@ -383,14 +381,14 @@ class BasicTokenizer(object):
self.strip_accents = strip_accents
def tokenize(self, text, never_split=None):
"""Basic Tokenization of a piece of text.
Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Args:
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
List of token not to split.
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
:func:`PreTrainedTokenizer.tokenize`). List of tokens not to split.
"""
# union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
......@@ -512,14 +510,11 @@ class WordpieceTokenizer(object):
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
......
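To make the greedy longest-match-first idea concrete, here is a simplified standalone sketch (not the class
implementation; ``vocab`` is assumed to be a set of known pieces, with ``##`` marking non-initial pieces)::

    def greedy_wordpiece(token, vocab, unk_token="[UNK]"):
        pieces, start = [], 0
        while start < len(token):
            end, current = len(token), None
            while start < end:
                piece = token[start:end] if start == 0 else "##" + token[start:end]
                if piece in vocab:
                    current = piece
                    break
                end -= 1
            if current is None:
                return [unk_token]  # no known piece matches, give up on the whole token
            pieces.append(current)
            start = end
        return pieces

    greedy_wordpiece("unaffable", {"un", "##aff", "##able"})  # ['un', '##aff', '##able']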
......@@ -130,25 +130,23 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to clean the text before tokenization by removing any control characters and
replacing all whitespaces by the classic one.
Whether or not to clean the text before tokenization by removing any control characters and replacing all
whitespaces by the classic one.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see `this issue
<https://github.com/huggingface/transformers/issues/328>`__).
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
issue <https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT).
......@@ -204,9 +202,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A BERT sequence has the following format:
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
......@@ -231,8 +228,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
::
......
......@@ -94,13 +94,13 @@ class BertJapaneseTokenizer(BertTokenizer):
mecab_kwargs=None,
**kwargs
):
"""Constructs a MecabBertTokenizer.
"""
Constructs a MecabBertTokenizer.
Args:
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
**do_lower_case**: (`optional`) boolean (default True)
Whether to lower case the input.
Only has an effect when do_basic_tokenize=True.
Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
**do_word_tokenize**: (`optional`) boolean (default True)
Whether to do word tokenization.
**do_subword_tokenize**: (`optional`) boolean (default True)
......@@ -205,20 +205,20 @@ class MecabTokenizer:
mecab_dic: Optional[str] = "ipadic",
mecab_option: Optional[str] = None,
):
"""Constructs a MecabTokenizer.
"""
Constructs a MecabTokenizer.
Args:
**do_lower_case**: (`optional`) boolean (default True)
Whether to lowercase the input.
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
List of tokens not to split.
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
:func:`PreTrainedTokenizer.tokenize`). List of tokens not to split.
**normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
**mecab_dic**: (`optional`) string (default "ipadic")
Name of dictionary to be used for MeCab initialization.
If you are using a system-installed dictionary, set thi option to `None` and modify `mecab_option`.
Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
set this option to `None` and modify `mecab_option`.
**mecab_option**: (`optional`) string
String passed to MeCab constructor.
"""
......@@ -306,7 +306,8 @@ class CharacterTokenizer:
"""Runs Character tokenziation."""
def __init__(self, vocab, unk_token, normalize_text=True):
"""Constructs a CharacterTokenizer.
"""
Constructs a CharacterTokenizer.
Args:
**vocab**:
......@@ -321,14 +322,15 @@ class CharacterTokenizer:
self.normalize_text = normalize_text
def tokenize(self, text):
"""Tokenizes a piece of text into characters.
"""
Tokenizes a piece of text into characters.
For example, :obj:`input = "apple""` wil return as output :obj:`["a", "p", "p", "l", "e"]`.
For example:
input = "apple"
output = ["a", "p", "p", "l", "e"]
Args:
text: A single token or whitespace separated tokens.
This should have already been passed through `BasicTokenizer`.
Returns:
A list of characters.
"""
......
......@@ -50,7 +50,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
def get_pairs(word):
"""Return set of symbol pairs in a word.
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
......@@ -83,23 +84,22 @@ class BertweetTokenizer(PreTrainedTokenizer):
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
......@@ -178,9 +178,8 @@ class BertweetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A BERTweet sequence has the following format:
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A BERTweet sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
......@@ -236,8 +235,8 @@ class BertweetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
BERTweet does not make use of token type ids, therefore a list of zeros is returned.
Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
......@@ -411,8 +410,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
def add_from_file(self, f):
"""
Loads a pre-existing dictionary from a text file and adds its symbols
to this instance.
Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
"""
if isinstance(f, str):
try:
......@@ -446,23 +444,17 @@ class BertweetTokenizer(PreTrainedTokenizer):
"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:
1. The tuple regex_strings defines a list of regular expression
strings.
1. The tuple regex_strings defines a list of regular expression strings.
2. The regex_strings strings are put, in order, into a compiled
regular expression object called word_re.
2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re.
3. The tokenization is done by word_re.findall(s), where s is the
user-supplied string, inside the tokenize() method of the class
Tokenizer.
3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of
the class Tokenizer.
4. When instantiating Tokenizer objects, there is a single option:
preserve_case. By default, it is set to True. If it is set to
False, then the tokenizer will downcase everything except for
emoticons.
4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
is set to False, then the tokenizer will downcase everything except for emoticons.
"""
......@@ -582,6 +574,7 @@ REGEXPS = (
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
# email addresses
r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
# docstyle-ignore
# Remaining word types:
r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
......@@ -627,28 +620,24 @@ def _str_to_unicode(text, encoding=None, errors="strict"):
def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
"""
Remove entities from text by converting them to their
corresponding unicode character.
Remove entities from text by converting them to their corresponding unicode character.
Args:
text:
A unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8').
keep (list):
List of entity names which should not be replaced. This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
and named entities (such as ``&nbsp;`` or ``&gt;``).
List of entity names which should not be replaced. This supports both numeric entities (``&#nnnn;`` and
``&#hhhh;``) and named entities (such as ``&nbsp;`` or ``&gt;``).
remove_illegal (bool):
If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are kept "as is".
If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
kept "as is".
Returns: A unicode string with the entities removed.
See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
>>> from nltk.tokenize.casual import _replace_html_entities
>>> _replace_html_entities(b'Price: &pound;100')
'Price: \\xa3100'
>>> print(_replace_html_entities(b'Price: &pound;100'))
Price: £100
>>>
>>> from nltk.tokenize.casual import _replace_html_entities
>>> _replace_html_entities(b'Price: &pound;100')
'Price: \\xa3100'
>>> print(_replace_html_entities(b'Price: &pound;100'))
Price: £100
>>>
"""
def _convert_entity(match):
......@@ -714,8 +703,8 @@ class TweetTokenizer:
Args:
text: str
Returns: list(str)
A tokenized list of strings; concatenating this list returns the original string if `preserve_case=False`
Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
`preserve_case=False`
"""
# Fix HTML character entities:
text = _replace_html_entities(text)
......@@ -742,8 +731,7 @@ class TweetTokenizer:
def reduce_lengthening(text):
"""
Replace repeated character sequences of length 3 or greater with sequences
of length 3.
Replace repeated character sequences of length 3 or greater with sequences of length 3.
"""
pattern = regex.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text)
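For instance (with a made-up input string)::

    reduce_lengthening("waaaaayyy cool!!!!!")  # -> 'waaayyy cool!!!'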
......