Unverified Commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Styling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
@@ -13,8 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl.
"""
@@ -87,15 +87,13 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
    """
    Params:
        hidden :: [len*bsz x d_proj]
        labels :: [len*bsz]

    Return:
        if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
        else: out :: [(len-1)*bsz] negative log likelihood.

    We could replace this implementation by the native PyTorch one if theirs had an option to set bias on all
    clusters. See:
    https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
    """
    if labels is not None:
@@ -191,15 +189,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
        return out

    def log_prob(self, hidden):
        r"""
        Computes log probabilities for all :math:`n\_classes`. From:
        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py

        Args:
            hidden (Tensor): a minibatch of examples

        Returns:
            log-probabilities for each class :math:`c` in range :math:`0 <= c <= n\_classes`, where
            :math:`n\_classes` is a parameter passed to the ``AdaptiveLogSoftmaxWithLoss`` constructor.

        Shape:
            - Input: :math:`(N, in\_features)`
            - Output: :math:`(N, n\_classes)`
        """
...
@@ -287,8 +287,8 @@ class ModuleUtilsMixin:
            Whether or not the attention scores are computed by chunks.

    Returns:
        :obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or
        list with :obj:`[None]` for each layer.
    """
    if head_mask is not None:
        head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
@@ -358,9 +358,9 @@ class ModuleUtilsMixin:
    """
    Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
    batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
    tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper
    <https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overridden for transformers with parameter
    re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.

    Args:
        batch_size (:obj:`int`):
@@ -390,23 +390,24 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
    * prune heads in the self-attention modules.

    Class attributes (overridden by derived classes):

        - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
          :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
        - **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a PyTorch
          model, taking as arguments:

            - **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the
              TensorFlow checkpoint.
            - **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated to
              the model.
            - **path** (:obj:`str`) -- A path to the TensorFlow checkpoint.

        - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
          derived classes of the same architecture adding modules on top of the base model.
        - **authorized_missing_keys** (:obj:`Optional[List[str]]`) -- A list of regex patterns of tensor names to
          ignore when loading the model (and avoid unnecessary warnings).
        - **keys_to_never_save** (:obj:`Optional[List[str]]`) -- A list of tensor names to ignore when saving the
          model (useful for keys that aren't trained, but which are deterministic).
    """

    config_class = None
@@ -684,9 +685,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
    Arguments:
        heads_to_prune (:obj:`Dict[int, List[int]]`):
            Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of
            heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
            0 and 2 on layer 1 and heads 2 and 3 on layer 2.
    """
    # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
    for layer, heads in heads_to_prune.items():
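A minimal sketch of the :obj:`prune_heads` call documented above; the checkpoint name is purely illustrative and not taken from this commit::

    from transformers import AutoModel

    model = AutoModel.from_pretrained("bert-base-uncased")  # any PreTrainedModel subclass works
    # Prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2, exactly as in the docstring example.
    model.prune_heads({1: [0, 2], 2: [2, 3]})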
@@ -743,8 +744,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
    r"""
    Instantiate a pretrained PyTorch model from a pre-trained model configuration.

    The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To
    train the model, you should first set it back in training mode with ``model.train()``.

    The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
    pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
@@ -806,21 +807,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
            Whether or not to delete incompletely received files. Will attempt to resume the download if such a
            file exists.
        proxies (:obj:`Dict[str, str]`, `optional`):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        output_loading_info (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to only look at local files (e.g., not try downloading the model).
        use_cdn (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
            our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
        mirror (:obj:`str`, `optional`, defaults to :obj:`None`):
            Mirror source to accelerate downloads in China. If you are from China and have an accessibility
            problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
            Please refer to the mirror site for more information.
        kwargs (remaining dictionary of keyword arguments, `optional`):
            Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
            :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
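A hedged sketch of the options documented in this hunk; the checkpoint name and proxy addresses are placeholders, not values from the commit::

    from transformers import AutoModel

    model = AutoModel.from_pretrained(
        "bert-base-uncased",
        output_attentions=True,            # extra kwarg forwarded to the configuration
        proxies={"http": "foo.bar:3128"},  # per-protocol proxies, as described above
        local_files_only=False,            # set to True to forbid any download attempt
    )
    # The model comes back in evaluation mode; switch to train mode before fine-tuning.
    model.train()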
@@ -1142,8 +1141,8 @@ class PoolerStartLogits(nn.Module):
        hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
            The final hidden states of the model.
        p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
            Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.

    Returns:
        :obj:`torch.FloatTensor`: The start logits for SQuAD.
@@ -1192,8 +1191,8 @@ class PoolerEndLogits(nn.Module):
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            The position of the first token for the labeled span.
        p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
            Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.

    .. note::
@@ -1296,13 +1295,15 @@ class SquadHeadOutput(ModelOutput):
    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
            (beam-search).
        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
@@ -1361,8 +1362,8 @@ class SQuADHead(nn.Module):
        is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Whether the question has a possible answer in the paragraph or not.
        p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
            Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
@@ -1441,8 +1442,8 @@ class SequenceSummary(nn.Module):
    Args:
        config (:class:`~transformers.PretrainedConfig`):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are:
@@ -1455,7 +1456,7 @@
            - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
              :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
            - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
              output, another string or :obj:`None` will add no activation.
            - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
              activation.
@@ -1618,8 +1619,8 @@ def prune_layer(
        dim (:obj:`int`, `optional`): The dimension on which to keep the indices.

    Returns:
        :obj:`torch.nn.Linear` or :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with
        :obj:`requires_grad=True`.
    """
    if isinstance(layer, nn.Linear):
        return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
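A small sketch of :obj:`prune_layer` on an :obj:`nn.Linear`, keeping only three output rows; the sizes are illustrative::

    import torch
    from torch import nn
    from transformers.modeling_utils import prune_layer

    layer = nn.Linear(768, 10)
    index = torch.tensor([0, 3, 7], dtype=torch.long)  # indices of the outputs to keep
    pruned = prune_layer(layer, index)                 # dim defaults to 0 for nn.Linear
    print(pruned.weight.shape)                         # torch.Size([3, 768]); requires_grad stays True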
@@ -1647,7 +1648,8 @@ def apply_chunking_to_forward(
        chunk_dim (:obj:`int`):
            The dimension over which the :obj:`input_tensors` should be chunked.
        input_tensors (:obj:`Tuple[torch.Tensor]`):
            The input tensors of ``forward_fn`` which will be chunked.

    Returns:
        :obj:`torch.Tensor`: A tensor with the same shape as the one :obj:`forward_fn` would have given if applied.
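A hedged sketch of chunking a feed-forward call over the sequence dimension, assuming the ``(forward_fn, chunk_size, chunk_dim, *input_tensors)`` argument order used by recent versions of the library (older releases took the chunk size and dimension first, so check your installed version); all sizes are made up::

    import torch
    from transformers.modeling_utils import apply_chunking_to_forward

    ffn = torch.nn.Linear(64, 64)

    def forward_chunk(hidden_states):
        return ffn(hidden_states)

    hidden_states = torch.randn(2, 128, 64)  # (batch, seq_len, hidden)
    # Apply forward_chunk to slices of 32 tokens along dimension 1 and concatenate the results.
    output = apply_chunking_to_forward(forward_chunk, 32, 1, hidden_states)
    assert output.shape == hidden_states.shape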
...
@@ -12,7 +12,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
PyTorch XLM model.
"""
@@ -228,8 +229,9 @@ class TransformerFFN(nn.Module):
class XLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = XLMConfig
@@ -278,7 +280,8 @@ class XLMForQuestionAnsweringOutput(ModelOutput):
        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
            (beam-search).
        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
@@ -289,8 +292,8 @@ class XLMForQuestionAnsweringOutput(ModelOutput):
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
@@ -312,14 +315,15 @@ XLM_START_DOCSTRING = r"""
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the
            model weights.
"""

XLM_INPUTS_DOCSTRING = r"""
@@ -327,45 +331,43 @@ XLM_INPUTS_DOCSTRING = r"""
    input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using :class:`~transformers.XLMTokenizer`. See
        :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
        details.

        `What are input IDs? <../glossary.html#input-ids>`__
    attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
        Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        `What are attention masks? <../glossary.html#attention-mask>`__
    langs (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
        A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
        language ids which can be obtained from the language names by using two conversion mappings provided in
        the configuration of the model (only provided for multilingual models). More precisely, the `language name
        to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
        `language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).

        See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
    token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
        Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
        1]``:

        - 0 corresponds to a `sentence A` token,
        - 1 corresponds to a `sentence B` token.

        `What are token type IDs? <../glossary.html#token-type-ids>`__
    position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
        Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
        config.max_position_embeddings - 1]``.

        `What are position IDs? <../glossary.html#position-ids>`__
    lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
        Length of each sentence that can be used to avoid performing attention on padding token indices. You can
        also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
        ``[0, ..., input_ids.size(-1)]``.
    cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
        Dictionary string to ``torch.FloatTensor`` that contains precomputed hidden states (key and values in the
        attention blocks) as computed by the model (see :obj:`cache` output below). Can be used to speed up
@@ -374,8 +376,7 @@ XLM_INPUTS_DOCSTRING = r"""
        The dictionary object will be modified in-place during the forward pass to add newly computed
        hidden-states.
    head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
        Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.
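An illustrative head mask for a hypothetical configuration with 12 layers and 16 heads, masking head 0 in every layer; the sizes are assumptions, not values from this diff::

    import torch

    num_layers, num_heads = 12, 16
    head_mask = torch.ones(num_layers, num_heads)  # 1 = keep the head
    head_mask[:, 0] = 0                            # 0 = mask the head
    # outputs = model(input_ids, head_mask=head_mask)  # passed together with the other inputs above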
@@ -478,9 +479,9 @@ class XLMModel(XLMPreTrainedModel):
        self.embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See
        base class PreTrainedModel.
        """
        for layer, heads in heads_to_prune.items():
            self.attentions[layer].prune_heads(heads)
@@ -672,8 +673,10 @@ class XLMPredLayer(nn.Module):
@add_start_docstrings(
    """
    The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    XLM_START_DOCSTRING,
)
class XLMWithLMHeadModel(XLMPreTrainedModel):
@@ -726,11 +729,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
            ``-100`` are ignored (masked); the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
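A sketch of the labels convention just documented: for the LM loss you can pass the input ids back as labels, since the shift happens inside the model. The checkpoint name is an assumption::

    from transformers import XLMTokenizer, XLMWithLMHeadModel

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    outputs = model(**inputs, labels=inputs["input_ids"], return_dict=True)
    print(outputs.loss)  # language modeling loss over the non-ignored labels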
@@ -764,8 +765,10 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    """,
    XLM_START_DOCSTRING,
)
class XLMForSequenceClassification(XLMPreTrainedModel):
@@ -803,9 +806,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square
            loss); if :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
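A sketch of single-label classification with the convention above; the checkpoint name and number of labels are illustrative::

    import torch
    from transformers import XLMTokenizer, XLMForSequenceClassification

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForSequenceClassification.from_pretrained("xlm-mlm-en-2048", num_labels=2)

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    labels = torch.tensor([1])  # one label per example, in [0, config.num_labels - 1]
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # cross-entropy loss, logits of shape (1, 2)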
@@ -851,8 +853,10 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_START_DOCSTRING,
)
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
@@ -891,12 +895,12 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
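A sketch of span labels for the simple QA head; the token positions are made up and, as noted above, get clamped to the sequence length::

    import torch
    from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForQuestionAnsweringSimple.from_pretrained("xlm-mlm-en-2048")

    inputs = tokenizer("Who ate the apple?", "The dog ate the apple.", return_tensors="pt")
    start_positions = torch.tensor([6])  # index of the first answer token in the joint sequence
    end_positions = torch.tensor([7])    # index of the last answer token
    outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions, return_dict=True)
    print(outputs.loss)  # average of the start and end cross-entropy losses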
@@ -953,8 +957,10 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD
    (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_START_DOCSTRING,
)
class XLMForQuestionAnswering(XLMPreTrainedModel):
@@ -991,19 +997,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
            Labels whether a question has an answer or no answer (SQuAD 2.0).
        cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
            Labels for position (index) of the classification token to use as input for computing plausibility of the
            answer.
        p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means the token should be
            masked; 0.0 means the token is not masked.

        Returns:
@@ -1067,8 +1074,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    XLM_START_DOCSTRING,
)
class XLMForTokenClassification(XLMPreTrainedModel):
@@ -1107,8 +1116,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
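A sketch of per-token labels in ``[0, ..., config.num_labels - 1]``; the checkpoint name and label count are illustrative::

    import torch
    from transformers import XLMTokenizer, XLMForTokenClassification

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForTokenClassification.from_pretrained("xlm-mlm-en-2048", num_labels=5)

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    labels = torch.zeros_like(inputs["input_ids"])  # one label per token, here all class 0
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, sequence_length, num_labels)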
@@ -1159,8 +1168,10 @@ class XLMForTokenClassification(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    XLM_START_DOCSTRING,
)
class XLMForMultipleChoice(XLMPreTrainedModel):
@@ -1198,9 +1209,9 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
            :obj:`input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
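A sketch of the :obj:`(batch_size, num_choices, sequence_length)` layout this head expects; the prompt and choices are invented and the checkpoint name is an assumption::

    import torch
    from transformers import XLMTokenizer, XLMForMultipleChoice

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForMultipleChoice.from_pretrained("xlm-mlm-en-2048")

    prompt = "The sky is"
    choices = ["blue.", "made of cheese."]
    encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
    inputs = {key: tensor.unsqueeze(0) for key, tensor in encoding.items()}  # add the num_choices dimension
    labels = torch.tensor([0])  # index of the correct choice for each example in the batch
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, num_choices)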
...
@@ -37,8 +37,8 @@ XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
class XLMProphetNetEncoder(ProphetNetEncoder):
    r"""
    This class overrides :class:`~transformers.ProphetNetEncoder`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Example::
@@ -59,8 +59,8 @@ class XLMProphetNetEncoder(ProphetNetEncoder):
class XLMProphetNetDecoder(ProphetNetDecoder):
    r"""
    This class overrides :class:`~transformers.ProphetNetDecoder`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Example::
@@ -81,8 +81,8 @@ class XLMProphetNetDecoder(ProphetNetDecoder):
class XLMProphetNetModel(ProphetNetModel):
    r"""
    This class overrides :class:`~transformers.ProphetNetModel`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Example::
@@ -104,8 +104,8 @@ class XLMProphetNetModel(ProphetNetModel):
class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
    r"""
    This class overrides :class:`~transformers.ProphetNetForConditionalGeneration`. Please check the superclass for the
    appropriate documentation alongside usage examples.

    Example::
@@ -127,8 +127,8 @@ class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
    r"""
    This class overrides :class:`~transformers.ProphetNetForCausalLM`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Example::
...
@@ -48,14 +48,15 @@ XLM_ROBERTA_START_DOCSTRING = r"""
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the
            model weights.
"""
...@@ -65,8 +66,8 @@ XLM_ROBERTA_START_DOCSTRING = r""" ...@@ -65,8 +66,8 @@ XLM_ROBERTA_START_DOCSTRING = r"""
) )
class XLMRobertaModel(RobertaModel): class XLMRobertaModel(RobertaModel):
""" """
This class overrides :class:`~transformers.RobertaModel`. Please check the This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate
superclass for the appropriate documentation alongside usage examples. documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
...@@ -78,8 +79,8 @@ class XLMRobertaModel(RobertaModel): ...@@ -78,8 +79,8 @@ class XLMRobertaModel(RobertaModel):
) )
class XLMRobertaForCausalLM(RobertaForCausalLM): class XLMRobertaForCausalLM(RobertaForCausalLM):
""" """
This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the superclass for the appropriate
superclass for the appropriate documentation alongside usage examples. documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
...@@ -91,64 +92,72 @@ class XLMRobertaForCausalLM(RobertaForCausalLM): ...@@ -91,64 +92,72 @@ class XLMRobertaForCausalLM(RobertaForCausalLM):
) )
class XLMRobertaForMaskedLM(RobertaForMaskedLM): class XLMRobertaForMaskedLM(RobertaForMaskedLM):
""" """
This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate
superclass for the appropriate documentation alongside usage examples. documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer """
on top of the pooled output) e.g. for GLUE tasks. """, XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
""" """
This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of """
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
""" """
This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of """
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForTokenClassification(RobertaForTokenClassification): class XLMRobertaForTokenClassification(RobertaForTokenClassification):
""" """
This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a """
linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).""", XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForQuestionAnswering(RobertaForQuestionAnswering): class XLMRobertaForQuestionAnswering(RobertaForQuestionAnswering):
""" """
This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
...@@ -13,7 +13,8 @@ ...@@ -13,7 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" PyTorch XLNet model. """
PyTorch XLNet model.
""" """
...@@ -58,9 +59,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -58,9 +59,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
"""A map of modules from TF to PyTorch. """
I use a map to keep the PyTorch model as A map of modules from TF to PyTorch. I use a map to keep the PyTorch model as identical to the original PyTorch
identical to the original PyTorch model as possible. model as possible.
""" """
tf_to_pt_map = {} tf_to_pt_map = {}
...@@ -541,8 +542,9 @@ class XLNetLayer(nn.Module): ...@@ -541,8 +542,9 @@ class XLNetLayer(nn.Module):
class XLNetPreTrainedModel(PreTrainedModel): class XLNetPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and """
a simple interface for downloading and loading pretrained models. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
""" """
config_class = XLNetConfig config_class = XLNetConfig
...@@ -598,8 +600,8 @@ class XLNetModelOutput(ModelOutput): ...@@ -598,8 +600,8 @@ class XLNetModelOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -634,8 +636,8 @@ class XLNetLMHeadModelOutput(ModelOutput): ...@@ -634,8 +636,8 @@ class XLNetLMHeadModelOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -668,8 +670,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput): ...@@ -668,8 +670,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -702,8 +704,8 @@ class XLNetForTokenClassificationOutput(ModelOutput): ...@@ -702,8 +704,8 @@ class XLNetForTokenClassificationOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -738,8 +740,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput): ...@@ -738,8 +740,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -774,8 +776,8 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput): ...@@ -774,8 +776,8 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -796,13 +798,15 @@ class XLNetForQuestionAnsweringOutput(ModelOutput): ...@@ -796,13 +798,15 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
Args: Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. Classification loss as the sum of start token, end token (and is_impossible if provided) classification
losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search). Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search). Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
(beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
...@@ -817,8 +821,8 @@ class XLNetForQuestionAnsweringOutput(ModelOutput): ...@@ -817,8 +821,8 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -841,14 +845,15 @@ XLNET_START_DOCSTRING = r""" ...@@ -841,14 +845,15 @@ XLNET_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
XLNET_INPUTS_DOCSTRING = r""" XLNET_INPUTS_DOCSTRING = r"""
...@@ -856,14 +861,13 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -856,14 +861,13 @@ XLNET_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`transformers.XLNetTokenizer`. Indices can be obtained using :class:`transformers.XLNetTokenizer`. See
See :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
:func:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
...@@ -871,8 +875,8 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -871,8 +875,8 @@ XLNET_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (see :obj:`mems` output below). Can be used to speed up sequential Contains pre-computed hidden-states (see :obj:`mems` output below). Can be used to speed up sequential
decoding. The token ids which have their past given to this model should not be passed as decoding. The token ids which have their past given to this model should not be passed as :obj:`input_ids`
:obj:`input_ids` as they have already been computed. as they have already been computed.
:obj:`use_cache` has to be set to :obj:`True` to make use of :obj:`mems`. :obj:`use_cache` has to be set to :obj:`True` to make use of :obj:`mems`.
perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`): perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`):
...@@ -881,24 +885,23 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -881,24 +885,23 @@ XLNET_INPUTS_DOCSTRING = r"""
- if ``perm_mask[k, i, j] = 0``, i attend to j in batch k; - if ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
- if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k. - if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
If not set, each token attends to all the others (full bidirectional attention). If not set, each token attends to all the others (full bidirectional attention). Only used during
Only used during pretraining (to define factorization order) or for sequential decoding (generation). pretraining (to define factorization order) or for sequential decoding (generation).
target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`): target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`):
Mask to indicate the output tokens to use. Mask to indicate the output tokens to use. If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token. is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding
Only used during pretraining for partial prediction or for sequential decoding (generation). (generation).
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`__ `What are token type IDs? <../glossary.html#token-type-ids>`__
input_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`): input_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Negative of :obj:`attention_mask`, i.e. with 0
Negative of :obj:`attention_mask`, i.e. with 0 for real tokens and 1 for padding, which is kept for for real tokens and 1 for padding, which is kept for compatibility with the original code base.
compatibility with the original code base.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
...@@ -907,8 +910,7 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -907,8 +910,7 @@ XLNET_INPUTS_DOCSTRING = r"""
You can only use one of :obj:`input_mask` and :obj:`attention_mask`. You can only use one of :obj:`input_mask` and :obj:`attention_mask`.
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**, - 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**. - 0 indicates the head is **masked**.
...@@ -1279,8 +1281,9 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -1279,8 +1281,9 @@ class XLNetModel(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a language modeling head on top """
(linear layer with weights tied to the input embeddings). """, XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetLMHeadModel(XLNetPreTrainedModel): class XLNetLMHeadModel(XLNetPreTrainedModel):
...@@ -1360,18 +1363,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1360,18 +1363,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`):
Labels for masked language modeling. Labels for masked language modeling. :obj:`num_predict` corresponds to :obj:`target_mapping.shape[1]`. If
:obj:`num_predict` corresponds to :obj:`target_mapping.shape[1]`. If :obj:`target_mapping` is :obj:`None`, :obj:`target_mapping` is :obj:`None`, then :obj:`num_predict` corresponds to :obj:`sequence_length`.
then :obj:`num_predict` corresponds to :obj:`sequence_length`.
The labels should correspond to the masked input words that should be predicted and depend on The labels should correspond to the masked input words that should be predicted and depend on
:obj:`target_mapping`. Note that in order to perform standard auto-regressive language modeling a :obj:`target_mapping`. Note that in order to perform standard auto-regressive language modeling a `<mask>` token
`<mask>` token has to be added to the :obj:`input_ids` (see the :obj:`prepare_inputs_for_generation` has to be added to the :obj:`input_ids` (see the :obj:`prepare_inputs_for_generation` function and examples
function and examples below). below).
Indices are selected in ``[-100, 0, ..., config.vocab_size]``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to ``-100`` are ignored, the
All labels set to ``-100`` are ignored, the loss is only loss is only computed for labels in ``[0, ..., config.vocab_size]``
computed for labels in ``[0, ..., config.vocab_size]``
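As a quick sketch of the setup this describes (standard auto-regressive use with an appended `<mask>` token; the checkpoint name and prompt are just examples)::

    import torch
    from transformers import XLNetLMHeadModel, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

    # Append a <mask> token whose value the model should predict.
    input_ids = torch.tensor(
        tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
    ).unsqueeze(0)

    # No token may attend to the last (masked) position ...
    perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
    perm_mask[:, :, -1] = 1.0
    # ... and we only request a prediction at that position, so num_predict == 1.
    target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
    target_mapping[0, 0, -1] = 1.0

    outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
    next_token_logits = outputs[0]  # shape (1, 1, vocab_size)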
Return: Return:
...@@ -1447,8 +1448,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1447,8 +1448,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a sequence classification/regression head on top (a linear layer on top of """
the pooled output) e.g. for GLUE tasks. """, XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForSequenceClassification(XLNetPreTrainedModel): class XLNetForSequenceClassification(XLNetPreTrainedModel):
...@@ -1488,9 +1491,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1488,9 +1491,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., config.num_labels - 1]``. config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -1540,8 +1542,10 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1540,8 +1542,10 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a token classification head on top (a linear layer on top of """
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForTokenClassification(XLNetPreTrainedModel): class XLNetForTokenClassification(XLNetPreTrainedModel):
...@@ -1580,9 +1584,9 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1580,9 +1584,9 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Labels for computing the token classification loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., config.num_labels - 1]``. config.num_labels - 1]``.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
...@@ -1635,8 +1639,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1635,8 +1639,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a multiple choice classification head on top (a linear layer on top of """
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """, XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RACE/SWAG tasks.
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForMultipleChoice(XLNetPreTrainedModel): class XLNetForMultipleChoice(XLNetPreTrainedModel):
...@@ -1675,9 +1681,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1675,9 +1681,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
of the input tensors. (See :obj:`input_ids` above) :obj:`input_ids` above)
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
...@@ -1734,8 +1740,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1734,8 +1740,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of """
the hidden-states output to compute `span start logits` and `span end logits`). """, XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
...@@ -1776,12 +1784,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1776,12 +1784,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
...@@ -1841,8 +1849,10 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1841,8 +1849,10 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of """
the hidden-states output to compute `span start logits` and `span end logits`). """, XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForQuestionAnswering(XLNetPreTrainedModel): class XLNetForQuestionAnswering(XLNetPreTrainedModel):
...@@ -1884,19 +1894,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1884,19 +1894,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels whether a question has an answer or no answer (SQuAD 2.0) Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer. Labels for position (index) of the classification token to use as input for computing plausibility of the
answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`): p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be
1.0 means token should be masked. 0.0 means token is not masked. masked. 0.0 means token is not masked.
Returns: Returns:
......
...@@ -70,8 +70,8 @@ def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: in ...@@ -70,8 +70,8 @@ def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: in
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
""" """
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
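A minimal usage sketch of this schedule (``model`` is assumed to exist and ``compute_loss`` is a hypothetical helper)::

    from transformers import AdamW, get_linear_schedule_with_warmup

    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=1000
    )

    for step in range(1000):
        loss = compute_loss()   # hypothetical helper returning a scalar loss
        loss.backward()
        optimizer.step()
        scheduler.step()        # lr ramps up for 100 steps, then decays linearly to 0
        optimizer.zero_grad()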
Args: Args:
optimizer (:class:`~torch.optim.Optimizer`): optimizer (:class:`~torch.optim.Optimizer`):
...@@ -170,9 +170,8 @@ def get_polynomial_decay_schedule_with_warmup( ...@@ -170,9 +170,8 @@ def get_polynomial_decay_schedule_with_warmup(
optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1 optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
): ):
""" """
Create a schedule with a learning rate that decreases as a polynomial decay Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
from the initial lr set in the optimizer to the end lr defined by `lr_end`, optimizer to the end lr defined by `lr_end`, after a warmup period during which it increases linearly from 0 to the
after a warmup period during which it increases linearly from 0 to the
initial lr set in the optimizer. initial lr set in the optimizer.
Args: Args:
...@@ -189,8 +188,8 @@ def get_polynomial_decay_schedule_with_warmup( ...@@ -189,8 +188,8 @@ def get_polynomial_decay_schedule_with_warmup(
last_epoch (:obj:`int`, `optional`, defaults to -1): last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training. The index of the last epoch when resuming training.
Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
based on the original BERT implementation at implementation at
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37 https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
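As a sketch, the post-warmup rule this schedule implements and a typical call (``optimizer`` is assumed to exist; the values are illustrative)::

    # Roughly: lr(step) = lr_end + (lr_init - lr_end) * (1 - progress) ** power,
    # where progress goes from 0 to 1 between num_warmup_steps and num_training_steps.
    from transformers import get_polynomial_decay_schedule_with_warmup

    scheduler = get_polynomial_decay_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=1000, lr_end=1e-7, power=2.0
    )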
Return: Return:
...@@ -218,8 +217,8 @@ def get_polynomial_decay_schedule_with_warmup( ...@@ -218,8 +217,8 @@ def get_polynomial_decay_schedule_with_warmup(
class AdamW(Optimizer): class AdamW(Optimizer):
""" """
Implements Adam algorithm with weight decay fix as introduced in Implements Adam algorithm with weight decay fix as introduced in `Decoupled Weight Decay Regularization
`Decoupled Weight Decay Regularization <https://arxiv.org/abs/1711.05101>`__. <https://arxiv.org/abs/1711.05101>`__.
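A common usage sketch (the parameter grouping below is a convention for skipping weight decay on biases and LayerNorm weights, not something AdamW itself requires; ``model`` is assumed to exist)::

    no_decay = ["bias", "LayerNorm.weight"]
    grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(grouped_parameters, lr=3e-5)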
Parameters: Parameters:
params (:obj:`Iterable[torch.nn.parameter.Parameter]`): params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
...@@ -320,12 +319,13 @@ class AdamW(Optimizer): ...@@ -320,12 +319,13 @@ class AdamW(Optimizer):
class Adafactor(Optimizer): class Adafactor(Optimizer):
""" """
AdaFactor pytorch implementation can be used as a drop-in replacement for Adam. AdaFactor pytorch implementation can be used as a drop-in replacement for Adam. Original fairseq code:
Original fairseq code: https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py
Paper: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` https://arxiv.org/abs/1804.04235 Paper: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` https://arxiv.org/abs/1804.04235 Note that
Note that this optimizer internally adjusts the learning rate depending on the *scale_parameter*, *relative_step* and this optimizer internally adjusts the learning rate depending on the *scale_parameter*, *relative_step* and
*warmup_init* options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and `relative_step=False`. *warmup_init* options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
`relative_step=False`.
Arguments: Arguments:
params (:obj:`Iterable[torch.nn.parameter.Parameter]`): params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
...@@ -352,6 +352,7 @@ class Adafactor(Optimizer): ...@@ -352,6 +352,7 @@ class Adafactor(Optimizer):
This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested it. This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested it.
Recommended T5 finetuning settings: Recommended T5 finetuning settings:
- Scheduled LR warm-up to fixed LR - Scheduled LR warm-up to fixed LR
- disable relative updates - disable relative updates
- use clip threshold: https://arxiv.org/abs/2004.14546 - use clip threshold: https://arxiv.org/abs/2004.14546
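For example, a sketch of the manual-schedule setup described above (``model`` is assumed to exist)::

    from transformers import Adafactor

    optimizer = Adafactor(
        model.parameters(),
        lr=1e-3,                 # only used because relative_step=False
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
    )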
...@@ -440,7 +441,9 @@ class Adafactor(Optimizer): ...@@ -440,7 +441,9 @@ class Adafactor(Optimizer):
return torch.mm(r_factor.unsqueeze(-1), c_factor.unsqueeze(0)) return torch.mm(r_factor.unsqueeze(-1), c_factor.unsqueeze(0))
def step(self, closure=None): def step(self, closure=None):
"""Performs a single optimization step. """
Performs a single optimization step.
Arguments: Arguments:
closure (callable, optional): A closure that reevaluates the model closure (callable, optional): A closure that reevaluates the model
and returns the loss. and returns the loss.
......
...@@ -153,8 +153,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): ...@@ -153,8 +153,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
""" """
Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
with the m and v parameters in strange ways as shown in with the m and v parameters in strange ways as shown in `Decoupled Weight Decay Regularization
`Decoupled Weight Decay Regularization <https://arxiv.org/abs/1711.05101>`__. <https://arxiv.org/abs/1711.05101>`__.
Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
to adding the square of the weights to the loss with plain (non-momentum) SGD. to adding the square of the weights to the loss with plain (non-momentum) SGD.
...@@ -169,8 +169,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): ...@@ -169,8 +169,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
epsilon (:obj:`float`, `optional`, defaults to 1e-7): epsilon (:obj:`float`, `optional`, defaults to 1e-7):
The epsilon parameter in Adam, which is a small constant for numerical stability. The epsilon parameter in Adam, which is a small constant for numerical stability.
amsgrad (:obj:`bool`, `optional`, defaults to `False`): amsgrad (:obj:`bool`, `optional`, defaults to `False`):
Whether to apply the AMSGrad variant of this algorithm or not, see Whether to apply the AMSGrad variant of this algorithm or not, see `On the Convergence of Adam and Beyond
`On the Convergence of Adam and Beyond <https://arxiv.org/abs/1904.09237>`__. <https://arxiv.org/abs/1904.09237>`__.
weight_decay_rate (:obj:`float`, `optional`, defaults to 0): weight_decay_rate (:obj:`float`, `optional`, defaults to 0):
The weight decay to apply. The weight decay to apply.
include_in_weight_decay (:obj:`List[str]`, `optional`): include_in_weight_decay (:obj:`List[str]`, `optional`):
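A rough TensorFlow-side sketch with the documented arguments (``exclude_from_weight_decay`` is assumed to mirror ``include_in_weight_decay``)::

    from transformers import AdamWeightDecay

    optimizer = AdamWeightDecay(
        learning_rate=3e-5,
        weight_decay_rate=0.01,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
    )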
...@@ -280,11 +280,10 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): ...@@ -280,11 +280,10 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
# Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py # Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object): class GradientAccumulator(object):
"""Gradient accumulation utility. """
When used with a distribution strategy, the accumulator should be called in a Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
replica context. Gradients will be accumulated locally on each replica, replica context. Gradients will be accumulated locally on each replica, without synchronization. Users should
without synchronization. Users should then call ``.gradients``, scale the then call ``.gradients``, scale the gradients if required, and pass the result to ``apply_gradients``.
gradients if required, and pass the result to ``apply_gradients``.
""" """
# We use the ON_READ synchronization policy so that no synchronization is # We use the ON_READ synchronization policy so that no synchronization is
......
...@@ -128,7 +128,8 @@ def get_default_model(targeted_task: Dict, framework: Optional[str], task_option ...@@ -128,7 +128,8 @@ def get_default_model(targeted_task: Dict, framework: Optional[str], task_option
"pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet. "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.
task_options (:obj:`Any`, None) task_options (:obj:`Any`, None)
Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for a translation task. Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for a
translation task.
Returns Returns
...@@ -239,8 +240,9 @@ class DefaultArgumentHandler(ArgumentHandler): ...@@ -239,8 +240,9 @@ class DefaultArgumentHandler(ArgumentHandler):
class PipelineDataFormat: class PipelineDataFormat:
""" """
Base class for all the pipeline supported data format both for reading and writing. Base class for all the pipeline supported data format both for reading and writing. Supported data formats
Supported data formats currently include: currently include:
- JSON - JSON
- CSV - CSV
- stdin/stdout (pipe) - stdin/stdout (pipe)
...@@ -323,8 +325,8 @@ class PipelineDataFormat: ...@@ -323,8 +325,8 @@ class PipelineDataFormat:
overwrite=False, overwrite=False,
) -> "PipelineDataFormat": ) -> "PipelineDataFormat":
""" """
Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending on
on :obj:`format`. :obj:`format`.
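For example, a sketch of this factory (the file names are hypothetical)::

    from transformers.pipelines import PipelineDataFormat

    data_format = PipelineDataFormat.from_str(
        format="csv",
        output_path="predictions.csv",
        input_path="inputs.csv",
        column="sentence",
        overwrite=False,
    )
    for entry in data_format:   # each entry comes from one CSV row
        ...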
Args: Args:
format: (:obj:`str`): format: (:obj:`str`):
...@@ -440,8 +442,7 @@ class JsonPipelineDataFormat(PipelineDataFormat): ...@@ -440,8 +442,7 @@ class JsonPipelineDataFormat(PipelineDataFormat):
class PipedPipelineDataFormat(PipelineDataFormat): class PipedPipelineDataFormat(PipelineDataFormat):
""" """
Read data from piped input to the python process. Read data from piped input to the python process. For multi-column data, columns should be separated by \t
For multi-column data, columns should be separated by \t
If columns are provided, then the output will be a dictionary with {column_x: value_x} If columns are provided, then the output will be a dictionary with {column_x: value_x}
...@@ -517,16 +518,16 @@ PIPELINE_INIT_ARGS = r""" ...@@ -517,16 +518,16 @@ PIPELINE_INIT_ARGS = r"""
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed. must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified If no framework is specified, will default to the one currently installed. If no framework is specified and
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
model is provided. is provided.
task (:obj:`str`, defaults to :obj:`""`): task (:obj:`str`, defaults to :obj:`""`):
A task-identifier for the pipeline. A task-identifier for the pipeline.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`): args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
Reference to the object in charge of parsing supplied pipeline parameters. Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to -1): device (:obj:`int`, `optional`, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
on the associated CUDA device id. the associated CUDA device id.
binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`): binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
Flag indicating if the output of the pipeline should happen in a binary format (i.e., pickle) or as raw text. Flag indicating if the output of the pipeline should happen in a binary format (i.e., pickle) or as raw text.
""" """
...@@ -538,8 +539,8 @@ class Pipeline(_ScikitCompat): ...@@ -538,8 +539,8 @@ class Pipeline(_ScikitCompat):
The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
different pipelines. different pipelines.
Base class implementing pipelined operations. Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
Pipeline workflow is defined as a sequence of the following operations: operations:
Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output
...@@ -691,10 +692,12 @@ class Pipeline(_ScikitCompat): ...@@ -691,10 +692,12 @@ class Pipeline(_ScikitCompat):
def _forward(self, inputs, return_tensors=False): def _forward(self, inputs, return_tensors=False):
""" """
Internal framework-specific forward dispatching. Internal framework-specific forward dispatching.
Args: Args:
inputs: dict holding all the keyword arguments required by the model forward method. inputs: dict holding all the keyword arguments required by the model forward method.
return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy arrays. return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy arrays.
Returns: Returns:
Numpy array Numpy array
""" """
...@@ -740,16 +743,16 @@ class FeatureExtractionPipeline(Pipeline): ...@@ -740,16 +743,16 @@ class FeatureExtractionPipeline(Pipeline):
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed. must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified If no framework is specified, will default to the one currently installed. If no framework is specified and
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
model is provided. is provided.
task (:obj:`str`, defaults to :obj:`""`): task (:obj:`str`, defaults to :obj:`""`):
A task-identifier for the pipeline. A task-identifier for the pipeline.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`): args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
Reference to the object in charge of parsing supplied pipeline parameters. Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to -1): device (:obj:`int`, `optional`, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
on the associated CUDA device id. the associated CUDA device id.
""" """
def __init__( def __init__(
...@@ -796,25 +799,23 @@ class TextGenerationPipeline(Pipeline): ...@@ -796,25 +799,23 @@ class TextGenerationPipeline(Pipeline):
task identifier: :obj:`"text-generation"`. task identifier: :obj:`"text-generation"`.
The models that this pipeline can use are models that have been trained with an autoregressive language modeling The models that this pipeline can use are models that have been trained with an autoregressive language modeling
objective, which includes the uni-directional models in the library (e.g. gpt2). objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available
See the list of available community models on community models on `huggingface.co/models <https://huggingface.co/models?filter=causal-lm>`__.
`huggingface.co/models <https://huggingface.co/models?filter=causal-lm>`__.
""" """
# Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology # in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
XL_PREFIX = """In 1991, the remains of Russian Tsar Nicholas II and his family XL_PREFIX = """
(except for Alexei and Maria) are discovered. In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western
remainder of the story. 1883 Western Siberia, Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision
a young Grigori Rasputin is asked by his father and a group of men to perform magic. and denounces one of the men as a horse thief. Although his father initially slaps him for making such an
Rasputin has a vision and denounces one of the men as a horse thief. Although his accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
father initially slaps him for making such an accusation, Rasputin watches as the the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop,
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of begging for his blessing. <eod> </s> <eos>
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, """
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
ALLOWED_MODELS = [ ALLOWED_MODELS = [
"XLNetLMHeadModel", "XLNetLMHeadModel",
...@@ -881,12 +882,11 @@ class TextGenerationPipeline(Pipeline): ...@@ -881,12 +882,11 @@ class TextGenerationPipeline(Pipeline):
prefix (:obj:`str`, `optional`): prefix (:obj:`str`, `optional`):
Prefix added to prompt. Prefix added to prompt.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text. - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
- **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
...@@ -985,19 +985,19 @@ class TextGenerationPipeline(Pipeline): ...@@ -985,19 +985,19 @@ class TextGenerationPipeline(Pipeline):
) )
class TextClassificationPipeline(Pipeline): class TextClassificationPipeline(Pipeline):
""" """
Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the `sequence classification
`sequence classification examples <../task_summary.html#sequence-classification>`__ for more information. examples <../task_summary.html#sequence-classification>`__ for more information.
This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative
sentiments). sentiments).
If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run a
a softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result. softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result.
The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
See the up-to-date list of available models on the up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=text-classification>`__. <https://huggingface.co/models?filter=text-classification>`__.
""" """
def __init__(self, return_all_scores: bool = False, **kwargs): def __init__(self, return_all_scores: bool = False, **kwargs):
...@@ -1020,8 +1020,7 @@ class TextClassificationPipeline(Pipeline): ...@@ -1020,8 +1020,7 @@ class TextClassificationPipeline(Pipeline):
One or several texts (or one list of prompts) to classify. One or several texts (or one list of prompts) to classify.
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
following keys:
- **label** (:obj:`str`) -- The label predicted. - **label** (:obj:`str`) -- The label predicted.
- **score** (:obj:`float`) -- The corresponding probability. - **score** (:obj:`float`) -- The corresponding probability.
...@@ -1085,16 +1084,15 @@ class ZeroShotClassificationPipeline(Pipeline): ...@@ -1085,16 +1084,15 @@ class ZeroShotClassificationPipeline(Pipeline):
language inference) tasks. language inference) tasks.
Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the candidate
candidate label being valid. Any NLI model can be used as long as the first output logit corresponds to label being valid. Any NLI model can be used as long as the first output logit corresponds to `contradiction` and
`contradiction` and the last to `entailment`. the last to `entailment`.
This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task identifier:
task identifier: :obj:`"zero-shot-classification"`. :obj:`"zero-shot-classification"`.
The models that this pipeline can use are models that have been fine-tuned on an NLI task. The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list
See the up-to-date list of available models on of available models on `huggingface.co/models <https://huggingface.co/models?search=nli>`__.
`huggingface.co/models <https://huggingface.co/models?search=nli>`__.
""" """
def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs): def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
...@@ -1126,21 +1124,20 @@ class ZeroShotClassificationPipeline(Pipeline): ...@@ -1126,21 +1124,20 @@ class ZeroShotClassificationPipeline(Pipeline):
The set of possible class labels to classify each sequence into. Can be a single label, a string of The set of possible class labels to classify each sequence into. Can be a single label, a string of
comma-separated labels, or a list of labels. comma-separated labels, or a list of labels.
hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This example is {}."`): hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This example is {}."`):
The template used to turn each label into an NLI-style hypothesis. This template must include a {} The template used to turn each label into an NLI-style hypothesis. This template must include a {} or
or similar syntax for the candidate label to be inserted into the template. For example, the default similar syntax for the candidate label to be inserted into the template. For example, the default
template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed
into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The
default template works well in many cases, but it may be worthwhile to experiment with different default template works well in many cases, but it may be worthwhile to experiment with different
templates depending on the task setting. templates depending on the task setting.
multi_class (:obj:`bool`, `optional`, defaults to :obj:`False`): multi_class (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized such
such that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are considered
considered independent and probabilities are normalized for each candidate by doing a softmax of independent and probabilities are normalized for each candidate by doing a softmax of the entailment
the entailment score vs. the contradiction score. score vs. the contradiction score.
Return: Return:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **sequence** (:obj:`str`) -- The sequence for which this is the output. - **sequence** (:obj:`str`) -- The sequence for which this is the output.
- **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood. - **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
...@@ -1188,15 +1185,14 @@ class ZeroShotClassificationPipeline(Pipeline): ...@@ -1188,15 +1185,14 @@ class ZeroShotClassificationPipeline(Pipeline):
) )
class FillMaskPipeline(Pipeline): class FillMaskPipeline(Pipeline):
""" """
Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the `masked language modeling
`masked language modeling examples <../task_summary.html#masked-language-modeling>`__ for more information. examples <../task_summary.html#masked-language-modeling>`__ for more information.
This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
task identifier: :obj:`"fill-mask"`. identifier: :obj:`"fill-mask"`.
The models that this pipeline can use are models that have been trained with a masked language modeling objective, The models that this pipeline can use are models that have been trained with a masked language modeling objective,
which includes the bi-directional models in the library. which includes the bi-directional models in the library. See the up-to-date list of available models on
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=masked-lm>`__. `huggingface.co/models <https://huggingface.co/models?filter=masked-lm>`__.
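A minimal, hedged usage sketch of the mask filling behaviour described above (the sentence is illustrative; the mask token is read from the pipeline's own tokenizer rather than hard-coded)::

    from transformers import pipeline

    unmasker = pipeline("fill-mask")

    # Use the tokenizer's mask token so the example works whatever the underlying model expects.
    masked_sentence = f"Paris is the capital of {unmasker.tokenizer.mask_token}."
    for prediction in unmasker(masked_sentence):
        print(prediction["sequence"], round(prediction["score"], 4))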
.. note:: .. note::
...@@ -1262,14 +1258,13 @@ class FillMaskPipeline(Pipeline): ...@@ -1262,14 +1258,13 @@ class FillMaskPipeline(Pipeline):
One or several texts (or one list of prompts) with masked tokens. One or several texts (or one list of prompts) with masked tokens.
targets (:obj:`str` or :obj:`List[str]`, `optional`): targets (:obj:`str` or :obj:`List[str]`, `optional`):
When passed, the model will return the scores for the passed token or tokens rather than the top k When passed, the model will return the scores for the passed token or tokens rather than the top k
predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will be
be tokenized and the first resulting token will be used (with a warning). tokenized and the first resulting token will be used (with a warning).
top_k (:obj:`int`, `optional`): top_k (:obj:`int`, `optional`):
When passed, overrides the number of predictions to return. When passed, overrides the number of predictions to return.
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
following keys:
- **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction. - **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction.
- **score** (:obj:`float`) -- The corresponding probability. - **score** (:obj:`float`) -- The corresponding probability.
...@@ -1369,16 +1364,16 @@ class FillMaskPipeline(Pipeline): ...@@ -1369,16 +1364,16 @@ class FillMaskPipeline(Pipeline):
) )
class TokenClassificationPipeline(Pipeline): class TokenClassificationPipeline(Pipeline):
""" """
Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the `named entity recognition
`named entity recognition examples <../task_summary.html#named-entity-recognition>`__ for more information. examples <../task_summary.html#named-entity-recognition>`__ for more information.
This token recognition pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This token recognition pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location task identifier: :obj:`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location
or miscellaneous). or miscellaneous).
The models that this pipeline can use are models that have been fine-tuned on a token classification task. The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=token-classification>`__. <https://huggingface.co/models?filter=token-classification>`__.
""" """
default_input_names = "sequences" default_input_names = "sequences"
...@@ -1560,11 +1555,11 @@ NerPipeline = TokenClassificationPipeline ...@@ -1560,11 +1555,11 @@ NerPipeline = TokenClassificationPipeline
class QuestionAnsweringArgumentHandler(ArgumentHandler): class QuestionAnsweringArgumentHandler(ArgumentHandler):
""" """
QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
to internal :class:`~transformers.SquadExample`. internal :class:`~transformers.SquadExample`.
QuestionAnsweringArgumentHandler manages all the possible ways to create a :class:`~transformers.SquadExample` from QuestionAnsweringArgumentHandler manages all the possible ways to create a :class:`~transformers.SquadExample` from the
the command-line supplied arguments. command-line supplied arguments.
""" """
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
...@@ -1623,15 +1618,15 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler): ...@@ -1623,15 +1618,15 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
@add_end_docstrings(PIPELINE_INIT_ARGS) @add_end_docstrings(PIPELINE_INIT_ARGS)
class QuestionAnsweringPipeline(Pipeline): class QuestionAnsweringPipeline(Pipeline):
""" """
Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the `question answering examples
`question answering examples <../task_summary.html#question-answering>`__ for more information. <../task_summary.html#question-answering>`__ for more information.
This question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"question-answering"`. task identifier: :obj:`"question-answering"`.
The models that this pipeline can use are models that have been fine-tuned on a question answering task. The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=question-answering>`__. <https://huggingface.co/models?filter=question-answering>`__.
""" """
default_input_names = "question,context" default_input_names = "question,context"
...@@ -1666,9 +1661,8 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1666,9 +1661,8 @@ class QuestionAnsweringPipeline(Pipeline):
question: Union[str, List[str]], context: Union[str, List[str]] question: Union[str, List[str]], context: Union[str, List[str]]
) -> Union[SquadExample, List[SquadExample]]: ) -> Union[SquadExample, List[SquadExample]]:
""" """
QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally. QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally. This helper method
This helper method encapsulates all the logic for converting question(s) and context(s) to encapsulates all the logic for converting question(s) and context(s) to :class:`~transformers.SquadExample`.
:class:`~transformers.SquadExample`.
We currently support extractive question answering. We currently support extractive question answering.
...@@ -1677,8 +1671,8 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1677,8 +1671,8 @@ class QuestionAnsweringPipeline(Pipeline):
context (:obj:`str` or :obj:`List[str]`): The context(s) in which we will look for the answer. context (:obj:`str` or :obj:`List[str]`): The context(s) in which we will look for the answer.
Returns: Returns:
One or a list of :class:`~transformers.SquadExample`: The corresponding One or a list of :class:`~transformers.SquadExample`: The corresponding :class:`~transformers.SquadExample`
:class:`~transformers.SquadExample` grouping question and context. grouping question and context.
""" """
if isinstance(question, list): if isinstance(question, list):
return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
...@@ -1693,11 +1687,11 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1693,11 +1687,11 @@ class QuestionAnsweringPipeline(Pipeline):
args (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`): args (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`):
One or several :class:`~transformers.SquadExample` containing the question and context. One or several :class:`~transformers.SquadExample` containing the question and context.
X (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`): X (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
One or several :class:`~transformers.SquadExample` containing the question and context One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
(will be treated the same way as if passed as the first positional argument). the same way as if passed as the first positional argument).
data (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`): data (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
One or several :class:`~transformers.SquadExample` containing the question and context One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
(will be treated the same way as if passed as the first positional argument). the same way as if passed as the first positional argument).
question (:obj:`str` or :obj:`List[str]`): question (:obj:`str` or :obj:`List[str]`):
One or several question(s) (must be used in conjunction with the :obj:`context` argument). One or several question(s) (must be used in conjunction with the :obj:`context` argument).
context (:obj:`str` or :obj:`List[str]`): context (:obj:`str` or :obj:`List[str]`):
...@@ -1719,8 +1713,7 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1719,8 +1713,7 @@ class QuestionAnsweringPipeline(Pipeline):
Whether or not we accept impossible as an answer. Whether or not we accept impossible as an answer.
Return: Return:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **score** (:obj:`float`) -- The probability associated to the answer. - **score** (:obj:`float`) -- The probability associated to the answer.
- **start** (:obj:`int`) -- The start index of the answer (in the tokenized version of the input). - **start** (:obj:`int`) -- The start index of the answer (in the tokenized version of the input).
...@@ -1825,12 +1818,12 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1825,12 +1818,12 @@ class QuestionAnsweringPipeline(Pipeline):
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
""" """
Take the output of any :obj:`ModelForQuestionAnswering` and generate probabilities for each span to be Take the output of any :obj:`ModelForQuestionAnswering` and generate probabilities for each span to be the
the actual answer. actual answer.
In addition, it filters out some unwanted/impossible cases like answer len being greater than In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
max_answer_len or answer end position being before the starting position. answer end position being before the starting position. The method supports outputting the k-best answers through
The method supports outputting the k-best answers through the topk argument. the topk argument.
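The span scoring idea can be summarized with the following NumPy sketch (a simplified illustration of the approach described above, not the pipeline's exact implementation)::

    import numpy as np

    def decode_spans(start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int):
        # Score every candidate (start, end) pair with the product of its probabilities.
        candidates = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 0))
        # Keep only spans whose end is not before their start and whose length does not exceed max_answer_len.
        candidates = np.tril(np.triu(candidates), max_answer_len - 1)
        # Pick the k best scoring spans.
        best = np.argsort(-candidates.flatten())[:topk]
        starts, ends = np.unravel_index(best, candidates.shape)
        return starts, ends, candidates[starts, ends]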
Args: Args:
start (:obj:`np.ndarray`): Individual start probabilities for each token. start (:obj:`np.ndarray`): Individual start probabilities for each token.
...@@ -1866,8 +1859,7 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1866,8 +1859,7 @@ class QuestionAnsweringPipeline(Pipeline):
def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]: def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
""" """
When decoding from token probabilities, this method maps token indexes to actual words in When decoding from token probabilities, this method maps token indexes to actual words in the initial context.
the initial context.
Args: Args:
text (:obj:`str`): The actual context to extract the answer from. text (:obj:`str`): The actual context to extract the answer from.
...@@ -1914,13 +1906,12 @@ class SummarizationPipeline(Pipeline): ...@@ -1914,13 +1906,12 @@ class SummarizationPipeline(Pipeline):
""" """
Summarize news articles and other documents. Summarize news articles and other documents.
This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
task identifier: :obj:`"summarization"`. identifier: :obj:`"summarization"`.
The models that this pipeline can use are models that have been fine-tuned on a summarization task, The models that this pipeline can use are models that have been fine-tuned on a summarization task, which are
which are currently '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. currently '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. See the up-to-date
See the up-to-date list of available models on list of available models on `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.
`huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.
Usage:: Usage::
...@@ -1957,17 +1948,16 @@ class SummarizationPipeline(Pipeline): ...@@ -1957,17 +1948,16 @@ class SummarizationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output. Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **summary_text** (:obj:`str`, present when ``return_text=True``) -- The summary of the corresponding - **summary_text** (:obj:`str`, present when ``return_text=True``) -- The summary of the corresponding
input. input.
- **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) - **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) --
-- The token ids of the summary. The token ids of the summary.
""" """
assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True" assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
assert len(documents) > 0, "Please provide a document to summarize" assert len(documents) > 0, "Please provide a document to summarize"
...@@ -2043,12 +2033,12 @@ class TranslationPipeline(Pipeline): ...@@ -2043,12 +2033,12 @@ class TranslationPipeline(Pipeline):
""" """
Translates from one language to another. Translates from one language to another.
This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
task identifier: :obj:`"translation_xx_to_yy"`. identifier: :obj:`"translation_xx_to_yy"`.
The models that this pipeline can use are models that have been fine-tuned on a translation task. The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=translation>`__. <https://huggingface.co/models?filter=translation>`__.
Usage:: Usage::
en_fr_translator = pipeline("translation_en_to_fr") en_fr_translator = pipeline("translation_en_to_fr")
...@@ -2078,12 +2068,11 @@ class TranslationPipeline(Pipeline): ...@@ -2078,12 +2068,11 @@ class TranslationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output. Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **translation_text** (:obj:`str`, present when ``return_text=True``) -- The translation. - **translation_text** (:obj:`str`, present when ``return_text=True``) -- The translation.
- **translation_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) - **translation_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
...@@ -2153,12 +2142,11 @@ class Text2TextGenerationPipeline(Pipeline): ...@@ -2153,12 +2142,11 @@ class Text2TextGenerationPipeline(Pipeline):
""" """
Pipeline for text to text generation using seq2seq models. Pipeline for text to text generation using seq2seq models.
This Text2TextGenerationPipeline pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This Text2TextGenerationPipeline pipeline can currently be loaded from :func:`~transformers.pipeline` using the
task identifier: :obj:`"text2text-generation"`. following task identifier: :obj:`"text2text-generation"`.
The models that this pipeline can use are models that have been fine-tuned on a translation task. The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models <https://huggingface.co/models?filter=seq2seq>`__.
`huggingface.co/models <https://huggingface.co/models?filter=seq2seq>`__.
Usage:: Usage::
...@@ -2191,12 +2179,11 @@ class Text2TextGenerationPipeline(Pipeline): ...@@ -2191,12 +2179,11 @@ class Text2TextGenerationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output. Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text. - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
- **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
...@@ -2346,10 +2333,8 @@ class Conversation: ...@@ -2346,10 +2333,8 @@ class Conversation:
Return: Return:
:obj:`str`: :obj:`str`:
Example: Example: Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 user >> Going to the movies tonight - any
Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 suggestions? bot >> The Big Lebowski
user >> Going to the movies tonight - any suggestions?
bot >> The Big Lebowski
""" """
output = "Conversation id: {} \n".format(self.uuid) output = "Conversation id: {} \n".format(self.uuid)
for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses): for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):
...@@ -2371,13 +2356,13 @@ class ConversationalPipeline(Pipeline): ...@@ -2371,13 +2356,13 @@ class ConversationalPipeline(Pipeline):
""" """
Multi-turn conversational pipeline. Multi-turn conversational pipeline.
This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
task identifier: :obj:`"conversational"`. identifier: :obj:`"conversational"`.
The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task, The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task,
currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`. currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=conversational>`__. <https://huggingface.co/models?filter=conversational>`__.
Usage:: Usage::
...@@ -2419,8 +2404,8 @@ class ConversationalPipeline(Pipeline): ...@@ -2419,8 +2404,8 @@ class ConversationalPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output. Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Returns: Returns:
:class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`: Conversation(s) with :class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`: Conversation(s) with
...@@ -2506,8 +2491,9 @@ class ConversationalPipeline(Pipeline): ...@@ -2506,8 +2491,9 @@ class ConversationalPipeline(Pipeline):
""" """
Cleans the padding history. Padding may be generated in two places when multiple conversations are provided as Cleans the padding history. Padding may be generated in two places when multiple conversations are provided as
an input: an input:
- at the end of the concatenated history and new user input, so that all inputs to the model have the same - at the end of the concatenated history and new user input, so that all inputs to the model have the same
length length
- at the end of the generated response, as some responses will be longer than others - at the end of the generated response, as some responses will be longer than others
This method cleans up these padding tokens so that the history for each conversation is not impacted by the This method cleans up these padding tokens so that the history for each conversation is not impacted by the
batching process. batching process.
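The cleanup idea can be illustrated with a toy sketch (this is not the pipeline's internal code; it simply strips trailing padding ids from one history)::

    def strip_trailing_padding(token_ids, pad_token_id):
        # Drop padding tokens appended by batching so that only the real history remains.
        while token_ids and token_ids[-1] == pad_token_id:
            token_ids = token_ids[:-1]
        return token_ids

    print(strip_trailing_padding([5, 17, 42, 0, 0, 0], pad_token_id=0))  # -> [5, 17, 42]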
...@@ -2651,8 +2637,8 @@ SUPPORTED_TASKS = { ...@@ -2651,8 +2637,8 @@ SUPPORTED_TASKS = {
def check_task(task: str) -> Tuple[Dict, Any]: def check_task(task: str) -> Tuple[Dict, Any]:
""" """
Checks an incoming task string, to validate that it is correct and return the Checks an incoming task string, to validate that it is correct and return the default Pipeline and Model classes, and
default Pipeline and Model classes, and default models if they exist. default models if they exist.
Args: Args:
task (:obj:`str`): task (:obj:`str`):
...@@ -2670,9 +2656,8 @@ def check_task(task: str) -> Tuple[Dict, Any]: ...@@ -2670,9 +2656,8 @@ def check_task(task: str) -> Tuple[Dict, Any]:
- :obj:`"conversational"` - :obj:`"conversational"`
Returns: Returns:
(task_defaults:obj:`dict`, task_options: (:obj:`tuple`, None)) (task_defaults:obj:`dict`, task_options: (:obj:`tuple`, None)) The actual dictionary required to initialize
The actual dictionary required to initialize the pipeline and some the pipeline and some extra task options for parametrized tasks like "translation_XX_to_YY"
extra task options for parametrized tasks like "translation_XX_to_YY"
""" """
...@@ -2737,17 +2722,16 @@ def pipeline( ...@@ -2737,17 +2722,16 @@ def pipeline(
If not provided, the default for the :obj:`task` will be loaded. If not provided, the default for the :obj:`task` will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`): tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be a model The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
identifier or an actual pretrained tokenizer inheriting from identifier or an actual pretrained tokenizer inheriting from :class:`~transformers.PreTrainedTokenizer`.
:class:`~transformers.PreTrainedTokenizer`.
If not provided, the default for the :obj:`task` will be loaded. If not provided, the default for the :obj:`task` will be loaded.
framework (:obj:`str`, `optional`): framework (:obj:`str`, `optional`):
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed. must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified If no framework is specified, will default to the one currently installed. If no framework is specified and
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
model is provided. is provided.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`). Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`).
kwargs: kwargs:
......
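Before moving on to the next file, a hedged usage sketch of the factory arguments described above (the checkpoint identifier is only an example of a sequence classification model)::

    from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

    model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # example checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Explicitly select the PyTorch backend ("pt"), as described for the `framework` argument.
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt")
    print(classifier("A clear and well written documentation page."))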
...@@ -75,7 +75,8 @@ class Index: ...@@ -75,7 +75,8 @@ class Index:
Returns: Returns:
:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`: A tensor of indices of retrieved documents. :obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`: A tensor of indices of retrieved documents.
:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`: A tensor of vector representations of retrieved documents. :obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`: A tensor of vector representations of
retrieved documents.
""" """
raise NotImplementedError raise NotImplementedError
...@@ -87,16 +88,17 @@ class Index: ...@@ -87,16 +88,17 @@ class Index:
def init_index(self): def init_index(self):
""" """
A function responsible for loading the index into memory. Should be called only once per training run of a RAG model. A function responsible for loading the index into memory. Should be called only once per training run of a RAG
E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load the index. model. E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load
the index.
""" """
raise NotImplementedError raise NotImplementedError
class LegacyIndex(Index): class LegacyIndex(Index):
""" """
An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR. An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR. We use
We use default faiss index parameters as specified in that repository. default faiss index parameters as specified in that repository.
Args: Args:
vector_size (:obj:`int`): vector_size (:obj:`int`):
...@@ -234,17 +236,20 @@ class HFIndexBase(Index): ...@@ -234,17 +236,20 @@ class HFIndexBase(Index):
class CanonicalHFIndex(HFIndexBase): class CanonicalHFIndex(HFIndexBase):
""" """
A wrapper around an instance of :class:`~datasets.Datasets`. If ``index_path`` is set to ``None``, A wrapper around an instance of :class:`~datasets.Datasets`. If ``index_path`` is set to ``None``, we load the
we load the pre-computed index available with the :class:`~datasets.arrow_dataset.Dataset`, otherwise, we load the index from the indicated path on disk. pre-computed index available with the :class:`~datasets.arrow_dataset.Dataset`, otherwise, we load the index from
the indicated path on disk.
Args: Args:
vector_size (:obj:`int`): the dimension of the passages embeddings used by the index vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``): dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids with ``datasets.list_datasets()``). A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with ``datasets.list_datasets()``).
dataset_split (:obj:`str`, optional, defaults to ``train``) dataset_split (:obj:`str`, optional, defaults to ``train``)
Which split of the ``dataset`` to load. Which split of the ``dataset`` to load.
index_name (:obj:`str`, optional, defaults to ``train``) index_name (:obj:`str`, optional, defaults to ``train``)
The index_name of the index associated with the ``dataset``. The index loaded from ``index_path`` will be saved under this name. The index_name of the index associated with the ``dataset``. The index loaded from ``index_path`` will be
saved under this name.
index_path (:obj:`str`, optional, defaults to ``None``) index_path (:obj:`str`, optional, defaults to ``None``)
The path to the serialized faiss index on disk. The path to the serialized faiss index on disk.
use_dummy_dataset (:obj:`bool`, optional, defaults to ``False``): If True, use the dummy configuration of the dataset for tests. use_dummy_dataset (:obj:`bool`, optional, defaults to ``False``): If True, use the dummy configuration of the dataset for tests.
...@@ -292,14 +297,14 @@ class CanonicalHFIndex(HFIndexBase): ...@@ -292,14 +297,14 @@ class CanonicalHFIndex(HFIndexBase):
class CustomHFIndex(HFIndexBase): class CustomHFIndex(HFIndexBase):
""" """
A wrapper around an instance of :class:`~datasets.Datasets`. A wrapper around an instance of :class:`~datasets.Datasets`. The dataset and the index are both loaded from the
The dataset and the index are both loaded from the indicated paths on disk. indicated paths on disk.
Args: Args:
vector_size (:obj:`int`): the dimension of the passages embeddings used by the index vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
dataset_path (:obj:`str`): dataset_path (:obj:`str`):
The path to the serialized dataset on disk. The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
The dataset should have 3 columns: title (str), text (str) and embeddings (arrays of dimension vector_size) embeddings (arrays of dimension vector_size)
index_path (:obj:`str`) index_path (:obj:`str`)
The path to the serialized faiss index on disk. The path to the serialized faiss index on disk.
""" """
...@@ -328,17 +333,17 @@ class CustomHFIndex(HFIndexBase): ...@@ -328,17 +333,17 @@ class CustomHFIndex(HFIndexBase):
class RagRetriever: class RagRetriever:
""" """
Retriever used to get documents from vector queries. Retriever used to get documents from vector queries. It retrieves the documents' embeddings as well as the documents'
It retrieves the documents' embeddings as well as the documents' contents, and it formats them to be used with a RagModel. contents, and it formats them to be used with a RagModel.
Args: Args:
config (:class:`~transformers.RagConfig`): config (:class:`~transformers.RagConfig`):
The configuration of the RAG model this Retriever is used with. Contains parameters indicating which ``Index`` to build. The configuration of the RAG model this Retriever is used with. Contains parameters indicating which
You can load your own custom dataset with ``config.index_name="custom"`` or use a canonical one (default) from the datasets library ``Index`` to build. You can load your own custom dataset with ``config.index_name="custom"`` or use a
with ``config.index_name="wiki_dpr"`` for example. canonical one (default) from the datasets library with ``config.index_name="wiki_dpr"`` for example.
question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`): question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
The tokenizer that was used to tokenize the question. The tokenizer that was used to tokenize the question. It is used to decode the question and then use the
It is used to decode the question and then use the generator_tokenizer. generator_tokenizer.
generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`): generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
The tokenizer used for the generator part of the RagModel. The tokenizer used for the generator part of the RagModel.
index (:class:`~transformers.retrieval_rag.Index`, optional, defaults to the one defined by the configuration): index (:class:`~transformers.retrieval_rag.Index`, optional, defaults to the one defined by the configuration):
...@@ -470,8 +475,8 @@ class RagRetriever: ...@@ -470,8 +475,8 @@ class RagRetriever:
Prefix added at the beginning of each input, typically used with T5-based models. Prefix added at the beginning of each input, typically used with T5-based models.
Return: Return:
:obj:`tuple(tensors)`: :obj:`tuple(tensors)`: a tuple consisting of two elements: contextualized ``input_ids`` and a compatible
a tuple consisting of two elements: contextualized ``input_ids`` and a compatible ``attention_mask``. ``attention_mask``.
""" """
def cat_input_and_doc(doc_title, doc_text, input_string, prefix): def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
...@@ -542,11 +547,10 @@ class RagRetriever: ...@@ -542,11 +547,10 @@ class RagRetriever:
The number of docs retrieved per query. The number of docs retrieved per query.
Return: Return:
:obj:`Tuple[np.ndarray, np.ndarray, List[dict]]`: :obj:`Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:
A tuple with the following objects:
- **retrieved_doc_embeds** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`) -- The - **retrieved_doc_embeds** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`) -- The retrieval
retrieval embeddings of the retrieved docs per query. embeddings of the retrieved docs per query.
- **doc_ids** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`) -- The ids of the documents in the - **doc_ids** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`) -- The ids of the documents in the
index index
- **doc_dicts** (:obj:`List[dict]`): The :obj:`retrieved_doc_embeds` examples per query. - **doc_dicts** (:obj:`List[dict]`): The :obj:`retrieved_doc_embeds` examples per query.
...@@ -581,16 +585,18 @@ class RagRetriever: ...@@ -581,16 +585,18 @@ class RagRetriever:
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
Output: Returns: :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following
:class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: fields:
- **context_input_ids** -- List of token ids to be fed to a model. - **context_input_ids** -- List of token ids to be fed to a model.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
- **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
:obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`). - **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model
(when :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
- **retrieved_doc_embeds** -- List of embeddings of the retrieved documents - **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
- **doc_ids** -- List of ids of the retrieved documents - **doc_ids** -- List of ids of the retrieved documents
""" """
......
...@@ -88,8 +88,8 @@ def is_pipeline_test(test_case): ...@@ -88,8 +88,8 @@ def is_pipeline_test(test_case):
""" """
Decorator marking a test as a pipeline test. Decorator marking a test as a pipeline test.
Pipeline tests are skipped by default and we can run only them by setting RUN_PIPELINE_TEST environment variable Pipeline tests are skipped by default and we can run only them by setting RUN_PIPELINE_TEST environment variable to
to a truthy value and selecting the is_pipeline_test pytest mark. a truthy value and selecting the is_pipeline_test pytest mark.
""" """
if not _run_pipeline_tests: if not _run_pipeline_tests:
...@@ -107,8 +107,7 @@ def slow(test_case): ...@@ -107,8 +107,7 @@ def slow(test_case):
""" """
Decorator marking a test as slow. Decorator marking a test as slow.
Slow tests are skipped by default. Set the RUN_SLOW environment variable Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them.
to a truthy value to run them.
""" """
if not _run_slow_tests: if not _run_slow_tests:
...@@ -121,9 +120,8 @@ def custom_tokenizers(test_case): ...@@ -121,9 +120,8 @@ def custom_tokenizers(test_case):
""" """
Decorator marking a test for a custom tokenizer. Decorator marking a test for a custom tokenizer.
Custom tokenizers require additional dependencies, and are skipped Custom tokenizers require additional dependencies, and are skipped by default. Set the RUN_CUSTOM_TOKENIZERS
by default. Set the RUN_CUSTOM_TOKENIZERS environment variable environment variable to a truthy value to run them.
to a truthy value to run them.
""" """
if not _run_custom_tokenizers: if not _run_custom_tokenizers:
return unittest.skip("test of custom tokenizers")(test_case) return unittest.skip("test of custom tokenizers")(test_case)
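A minimal, hedged usage sketch of the test decorators described above (the test bodies are illustrative)::

    import unittest

    from transformers.testing_utils import custom_tokenizers, slow

    class ExampleTests(unittest.TestCase):
        @slow
        def test_big_model(self):
            # Only runs when the RUN_SLOW environment variable is truthy.
            self.assertTrue(True)

        @custom_tokenizers
        def test_special_tokenizer(self):
            # Only runs when RUN_CUSTOM_TOKENIZERS is truthy.
            self.assertTrue(True)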
...@@ -201,8 +199,7 @@ def require_torch_multigpu(test_case): ...@@ -201,8 +199,7 @@ def require_torch_multigpu(test_case):
These tests are skipped on a machine without multiple GPUs. These tests are skipped on a machine without multiple GPUs.
To run *only* the multigpu tests, assuming all test names contain multigpu: To run *only* the multigpu tests, assuming all test names contain multigpu: $ pytest -sv ./tests -k "multigpu"
$ pytest -sv ./tests -k "multigpu"
""" """
if not _torch_available: if not _torch_available:
return unittest.skip("test requires PyTorch")(test_case) return unittest.skip("test requires PyTorch")(test_case)
...@@ -306,8 +303,8 @@ def get_tests_dir(append_path=None): ...@@ -306,8 +303,8 @@ def get_tests_dir(append_path=None):
append_path: optional path to append to the tests dir path append_path: optional path to append to the tests dir path
Return: Return:
The full path to the `tests` dir, so that the tests can be invoked from anywhere. The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is
Optionally `append_path` is joined after the `tests` dir if the former is provided. joined after the `tests` dir if the former is provided.
""" """
# this function caller's __file__ # this function caller's __file__
...@@ -344,30 +341,29 @@ def assert_screenout(out, what): ...@@ -344,30 +341,29 @@ def assert_screenout(out, what):
class CaptureStd: class CaptureStd:
"""Context manager to capture: """
stdout, clean it up and make it available via obj.out Context manager to capture:
stderr, and make it available via obj.err stdout, clean it up and make it available via obj.out stderr, and make it available via obj.err
init arguments: init arguments: - out - capture stdout: True/False, default True - err - capture stderr: True/False, default
- out - capture stdout: True/False, default True True
- err - capture stderr: True/False, default True
Examples:: Examples::
with CaptureStdout() as cs: with CaptureStdout() as cs:
print("Secret message") print("Secret message")
print(f"captured: {cs.out}") print(f"captured: {cs.out}")
import sys import sys
with CaptureStderr() as cs: with CaptureStderr() as cs:
print("Warning: ", file=sys.stderr) print("Warning: ", file=sys.stderr)
print(f"captured: {cs.err}") print(f"captured: {cs.err}")
# to capture just one of the streams, but not the other # to capture just one of the streams, but not the other
with CaptureStd(err=False) as cs: with CaptureStd(err=False) as cs:
print("Secret message") print("Secret message")
print(f"captured: {cs.out}") print(f"captured: {cs.out}")
# but best use the stream-specific subclasses # but best use the stream-specific subclasses
""" """
...@@ -436,7 +432,8 @@ class CaptureStderr(CaptureStd): ...@@ -436,7 +432,8 @@ class CaptureStderr(CaptureStd):
class CaptureLogger: class CaptureLogger:
"""Context manager to capture `logging` streams """
Context manager to capture `logging` streams
Args: Args:
- logger: `logging` logger object - logger: `logging` logger object
...@@ -476,13 +473,12 @@ class CaptureLogger: ...@@ -476,13 +473,12 @@ class CaptureLogger:
class TestCasePlus(unittest.TestCase): class TestCasePlus(unittest.TestCase):
"""This class extends `unittest.TestCase` with additional features. """
This class extends `unittest.TestCase` with additional features.
Feature 1: Flexible auto-removable temp dirs which are guaranteed to get Feature 1: Flexible auto-removable temp dirs which are guaranteed to get removed at the end of test.
removed at the end of test.
In all the following scenarios the temp dir will be auto-removed at the end In all the following scenarios the temp dir will be auto-removed at the end of test, unless `after=False`.
of test, unless `after=False`.
# 1. create a unique temp dir, `tmp_dir` will contain the path to the created temp dir # 1. create a unique temp dir, `tmp_dir` will contain the path to the created temp dir
...@@ -491,38 +487,35 @@ class TestCasePlus(unittest.TestCase): ...@@ -491,38 +487,35 @@ class TestCasePlus(unittest.TestCase):
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir() tmp_dir = self.get_auto_remove_tmp_dir()
# 2. create a temp dir of my choice and delete it at the end - useful for debug when you want to # 2. create a temp dir of my choice and delete it at the end - useful for debug when you want to # monitor a
# monitor a specific directory specific directory
:: ::
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test") tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test")
# 3. create a temp dir of my choice and do not delete it at the end - useful for when you want # 3. create a temp dir of my choice and do not delete it at the end - useful for when you want # to look at the
# to look at the temp results temp results
:: ::
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", after=False) tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", after=False)
# 4. create a temp dir of my choice and ensure to delete it right away - useful for when you # 4. create a temp dir of my choice and ensure to delete it right away - useful for when you # disabled deletion in
# disabled deletion in the previous test run and want to make sure that the tmp dir is empty the previous test run and want to make sure that the tmp dir is empty # before the new test is run
# before the new test is run
:: ::
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", before=True) tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", before=True)
Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are
project repository checkout are allowed if an explicit `tmp_dir` is used, so allowed if an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem
that by mistake no `/tmp` or similar important part of the filesystem will will get nuked. i.e. please always pass paths that start with `./`
get nuked. i.e. please always pass paths that start with `./`
Note 2: Each test can register multiple temp dirs and they all will get Note 2: Each test can register multiple temp dirs and they all will get auto-removed, unless requested otherwise.
auto-removed, unless requested otherwise.
""" """
...@@ -540,8 +533,8 @@ class TestCasePlus(unittest.TestCase): ...@@ -540,8 +533,8 @@ class TestCasePlus(unittest.TestCase):
delete the tmp dir at the end of the test delete the tmp dir at the end of the test
Returns: Returns:
tmp_dir(:obj:`string`): tmp_dir(:obj:`string`): either the same value as passed via `tmp_dir` or the path to the auto-created tmp
either the same value as passed via `tmp_dir` or the path to the auto-created tmp dir dir
""" """
if tmp_dir is not None: if tmp_dir is not None:
# using provided path # using provided path
...@@ -577,11 +570,10 @@ class TestCasePlus(unittest.TestCase): ...@@ -577,11 +570,10 @@ class TestCasePlus(unittest.TestCase):
def mockenv(**kwargs): def mockenv(**kwargs):
"""this is a convenience wrapper, that allows this: """
this is a convenience wrapper, that allows this:
@mockenv(RUN_SLOW=True, USE_TF=False) @mockenv(RUN_SLOW=True, USE_TF=False) def test_something(): run_slow = os.getenv("RUN_SLOW", False) use_tf =
def test_something(): os.getenv("USE_TF", False)
run_slow = os.getenv("RUN_SLOW", False)
use_tf = os.getenv("USE_TF", False)
""" """
return unittest.mock.patch.dict(os.environ, kwargs) return unittest.mock.patch.dict(os.environ, kwargs)
...@@ -78,35 +78,33 @@ class AlbertTokenizer(PreTrainedTokenizer): ...@@ -78,35 +78,33 @@ class AlbertTokenizer(PreTrainedTokenizer):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -224,9 +222,8 @@ class AlbertTokenizer(PreTrainedTokenizer): ...@@ -224,9 +222,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An ALBERT sequence has the following format:
An ALBERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]``
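Purely to illustrate the layout above (the ``tokenizer`` instance and the raw ids are placeholders, not part of this diff):

::

    # token_ids_0 = [10, 11], token_ids_1 = [20, 21]
    ids = tokenizer.build_inputs_with_special_tokens([10, 11], [20, 21])
    # expected layout: [cls_id, 10, 11, sep_id, 20, 21, sep_id]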
...@@ -281,8 +278,8 @@ class AlbertTokenizer(PreTrainedTokenizer): ...@@ -281,8 +278,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
An ALBERT sequence pair mask has the following format: sequence pair mask has the following format:
:: ::
......
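For the pair mask documented just above, a hedged sketch (the method name ``create_token_type_ids_from_sequences`` and the ids are assumed for illustration):

::

    mask = tokenizer.create_token_type_ids_from_sequences([10, 11], [20, 21])
    # first sequence and its separators -> 0, second sequence -> 1
    # e.g. [0, 0, 0, 0, 1, 1, 1]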
...@@ -71,10 +71,11 @@ SPIECE_UNDERLINE = "▁" ...@@ -71,10 +71,11 @@ SPIECE_UNDERLINE = "▁"
class AlbertTokenizerFast(PreTrainedTokenizerFast): class AlbertTokenizerFast(PreTrainedTokenizerFast):
""" """
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
`SentencePiece <https://github.com/google/sentencepiece>`__. <https://github.com/google/sentencepiece>`__. This tokenizer inherits from
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this
methods. Users should refer to this superclass for more information regarding those methods. superclass for more information regarding those methods.
Args: Args:
vocab_file (:obj:`str`): vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
...@@ -87,31 +88,26 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): ...@@ -87,31 +88,26 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Whether or not to keep accents when tokenizing. Whether or not to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
...@@ -162,9 +158,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): ...@@ -162,9 +158,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An ALBERT sequence has the following format:
An ALBERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]``
...@@ -219,8 +214,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): ...@@ -219,8 +214,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
An ALBERT sequence pair mask has the following format: sequence pair mask has the following format:
:: ::
......
...@@ -221,8 +221,8 @@ SLOW_TOKENIZER_MAPPING = { ...@@ -221,8 +221,8 @@ SLOW_TOKENIZER_MAPPING = {
class AutoTokenizer: class AutoTokenizer:
r""" r"""
This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
when created with the :meth:`AutoTokenizer.from_pretrained` class method. created with the :meth:`AutoTokenizer.from_pretrained` class method.
This class cannot be instantiated directly using ``__init__()`` (throws an error). This class cannot be instantiated directly using ``__init__()`` (throws an error).
""" """
...@@ -257,8 +257,8 @@ class AutoTokenizer: ...@@ -257,8 +257,8 @@ class AutoTokenizer:
using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g., using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.,
``./my_model_directory/``. ``./my_model_directory/``.
- A path or url to a single saved vocabulary file if and only if the tokenizer only requires a - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. (Not
(Not applicable to all derived classes) applicable to all derived classes)
inputs (additional positional arguments, `optional`): inputs (additional positional arguments, `optional`):
Will be passed along to the Tokenizer ``__init__()`` method. Will be passed along to the Tokenizer ``__init__()`` method.
config (:class:`~transformers.PreTrainedConfig`, `optional`): config (:class:`~transformers.PreTrainedConfig`, `optional`):
...@@ -273,9 +273,8 @@ class AutoTokenizer: ...@@ -273,9 +273,8 @@ class AutoTokenizer:
Whether or not to delete incompletely received files. Will attempt to resume the download if such a Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists. file exists.
proxies (:obj:`Dict[str, str]`, `optional`): proxies (:obj:`Dict[str, str]`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g., A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
request.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to try to load the fast version of the tokenizer. Whether or not to try to load the fast version of the tokenizer.
kwargs (additional keyword arguments, `optional`): kwargs (additional keyword arguments, `optional`):
......
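A short usage sketch of the class method documented above (the checkpoint name is just a common example):

::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    encoded = tokenizer("Hello world", return_tensors="pt")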
...@@ -44,8 +44,8 @@ class BartTokenizer(RobertaTokenizer): ...@@ -44,8 +44,8 @@ class BartTokenizer(RobertaTokenizer):
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch` :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
the initialization parameters and other methods. initialization parameters and other methods.
""" """
# merges and vocab same as Roberta # merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models} max_model_input_sizes = {m: 1024 for m in _all_bart_models}
...@@ -75,13 +75,13 @@ class BartTokenizer(RobertaTokenizer): ...@@ -75,13 +75,13 @@ class BartTokenizer(RobertaTokenizer):
tgt_texts: (:obj:`List[str]`, `optional`): tgt_texts: (:obj:`List[str]`, `optional`):
List of summaries or target language texts. List of summaries or target language texts.
max_length (:obj:`int`, `optional`): max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts). Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
length is required by one of the truncation/padding parameters. If the model has no specific maximum is required by one of the truncation/padding parameters. If the model has no specific maximum input
input length (like XLNet) truncation/padding to a maximum length will be deactivated. length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`): max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries). Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
If left unset or set to :obj:`None`, this will use the max_length value. set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
...@@ -122,8 +122,8 @@ class BartTokenizer(RobertaTokenizer): ...@@ -122,8 +122,8 @@ class BartTokenizer(RobertaTokenizer):
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **labels** -- List of token ids for tgt_texts - **labels** -- List of token ids for tgt_texts
The full set of keys ``[input_ids, attention_mask, labels]`` The full set of keys ``[input_ids, attention_mask, labels]`` will only be returned if tgt_texts is passed.
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. Otherwise, input_ids, attention_mask will be the only keys.
""" """
kwargs.pop("src_lang", None) kwargs.pop("src_lang", None)
kwargs.pop("tgt_lang", None) kwargs.pop("tgt_lang", None)
......
...@@ -70,13 +70,13 @@ class BartTokenizerFast(RobertaTokenizerFast): ...@@ -70,13 +70,13 @@ class BartTokenizerFast(RobertaTokenizerFast):
tgt_texts: (:obj:`List[str]`, `optional`): tgt_texts: (:obj:`List[str]`, `optional`):
List of summaries or target language texts. List of summaries or target language texts.
max_length (:obj:`int`, `optional`): max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts). Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
length is required by one of the truncation/padding parameters. If the model has no specific maximum is required by one of the truncation/padding parameters. If the model has no specific maximum input
input length (like XLNet) truncation/padding to a maximum length will be deactivated. length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`): max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries). Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
If left unset or set to :obj:`None`, this will use the max_length value. set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
...@@ -116,11 +116,11 @@ class BartTokenizerFast(RobertaTokenizerFast): ...@@ -116,11 +116,11 @@ class BartTokenizerFast(RobertaTokenizerFast):
- **input_ids** -- List of token ids to be fed to the encoder. - **input_ids** -- List of token ids to be fed to the encoder.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **decoder_input_ids** -- List of token ids to be fed to the decoder. - **decoder_input_ids** -- List of token ids to be fed to the decoder.
- **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the decoder. - **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the
This does not include causal mask, which is built by the model. decoder. This does not include causal mask, which is built by the model.
The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]`` The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]`` will only
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
""" """
if max_length is None: if max_length is None:
max_length = self.model_max_length max_length = self.model_max_length
......
...@@ -135,15 +135,14 @@ class BertTokenizer(PreTrainedTokenizer): ...@@ -135,15 +135,14 @@ class BertTokenizer(PreTrainedTokenizer):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
...@@ -250,9 +249,8 @@ class BertTokenizer(PreTrainedTokenizer): ...@@ -250,9 +249,8 @@ class BertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. A BERT sequence has the following format:
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]``
...@@ -307,8 +305,8 @@ class BertTokenizer(PreTrainedTokenizer): ...@@ -307,8 +305,8 @@ class BertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
A BERT sequence pair mask has the following format: pair mask has the following format:
:: ::
...@@ -383,14 +381,14 @@ class BasicTokenizer(object): ...@@ -383,14 +381,14 @@ class BasicTokenizer(object):
self.strip_accents = strip_accents self.strip_accents = strip_accents
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
"""Basic Tokenization of a piece of text. """
Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Args: Args:
**never_split**: (`optional`) list of str **never_split**: (`optional`) list of str
Kept for backward compatibility purposes. Kept for backward compatibility purposes. Now implemented directly at the base class level (see
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
List of tokens not to split.
""" """
# union() returns a new set by concatenating the two sets. # union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
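A small sketch of the behaviour described above (output assumes the default options with lower-casing enabled; punctuation is split into separate tokens):

::

    basic = BasicTokenizer(do_lower_case=True)
    basic.tokenize("Hello, World!")  # ['hello', ',', 'world', '!']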
...@@ -512,14 +510,11 @@ class WordpieceTokenizer(object): ...@@ -512,14 +510,11 @@ class WordpieceTokenizer(object):
self.max_input_chars_per_word = max_input_chars_per_word self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text): def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces. """
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
This uses a greedy longest-match-first algorithm to perform tokenization tokenization using the given vocabulary.
using the given vocabulary.
For example: For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.
input = "unaffable"
output = ["un", "##aff", "##able"]
Args: Args:
text: A single token or whitespace separated tokens. This should have text: A single token or whitespace separated tokens. This should have
......
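A hedged sketch of the greedy longest-match-first lookup (the tiny vocabulary is invented purely for illustration; any container supporting ``in`` membership should do):

::

    vocab = {"un", "##aff", "##able", "[UNK]"}
    wordpiece = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
    wordpiece.tokenize("unaffable")  # ['un', '##aff', '##able']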
...@@ -130,25 +130,23 @@ class BertTokenizerFast(PreTrainedTokenizerFast): ...@@ -130,25 +130,23 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to clean the text before tokenization by removing any control characters and Whether or not to clean the text before tokenization by removing any control characters and replacing all
replacing all whitespaces by the classic one. whitespaces by the classic one.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to tokenize Chinese characters. Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
This should likely be deactivated for Japanese (see `this issue issue <https://github.com/huggingface/transformers/issues/328>`__).
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`): strip_accents: (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT). value for :obj:`lowercase` (as in the original BERT).
...@@ -204,9 +202,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast): ...@@ -204,9 +202,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. A BERT sequence has the following format:
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]``
...@@ -231,8 +228,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast): ...@@ -231,8 +228,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
A BERT sequence pair mask has the following format: pair mask has the following format:
:: ::
......
...@@ -94,13 +94,13 @@ class BertJapaneseTokenizer(BertTokenizer): ...@@ -94,13 +94,13 @@ class BertJapaneseTokenizer(BertTokenizer):
mecab_kwargs=None, mecab_kwargs=None,
**kwargs **kwargs
): ):
"""Constructs a MecabBertTokenizer. """
Constructs a MecabBertTokenizer.
Args: Args:
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file. **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
**do_lower_case**: (`optional`) boolean (default True) **do_lower_case**: (`optional`) boolean (default True)
Whether to lower case the input. Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
Only has an effect when do_basic_tokenize=True.
**do_word_tokenize**: (`optional`) boolean (default True) **do_word_tokenize**: (`optional`) boolean (default True)
Whether to do word tokenization. Whether to do word tokenization.
**do_subword_tokenize**: (`optional`) boolean (default True) **do_subword_tokenize**: (`optional`) boolean (default True)
...@@ -205,20 +205,20 @@ class MecabTokenizer: ...@@ -205,20 +205,20 @@ class MecabTokenizer:
mecab_dic: Optional[str] = "ipadic", mecab_dic: Optional[str] = "ipadic",
mecab_option: Optional[str] = None, mecab_option: Optional[str] = None,
): ):
"""Constructs a MecabTokenizer. """
Constructs a MecabTokenizer.
Args: Args:
**do_lower_case**: (`optional`) boolean (default True) **do_lower_case**: (`optional`) boolean (default True)
Whether to lowercase the input. Whether to lowercase the input.
**never_split**: (`optional`) list of str **never_split**: (`optional`) list of str
Kept for backward compatibility purposes. Kept for backward compatibility purposes. Now implemented directly at the base class level (see
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
List of tokens not to split.
**normalize_text**: (`optional`) boolean (default True) **normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization. Whether to apply unicode normalization to text before tokenization.
**mecab_dic**: (`optional`) string (default "ipadic") **mecab_dic**: (`optional`) string (default "ipadic")
Name of dictionary to be used for MeCab initialization. Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
If you are using a system-installed dictionary, set this option to `None` and modify `mecab_option`. set this option to `None` and modify `mecab_option`.
**mecab_option**: (`optional`) string **mecab_option**: (`optional`) string
String passed to MeCab constructor. String passed to MeCab constructor.
""" """
...@@ -306,7 +306,8 @@ class CharacterTokenizer: ...@@ -306,7 +306,8 @@ class CharacterTokenizer:
"""Runs Character tokenziation.""" """Runs Character tokenziation."""
def __init__(self, vocab, unk_token, normalize_text=True): def __init__(self, vocab, unk_token, normalize_text=True):
"""Constructs a CharacterTokenizer. """
Constructs a CharacterTokenizer.
Args: Args:
**vocab**: **vocab**:
...@@ -321,14 +322,15 @@ class CharacterTokenizer: ...@@ -321,14 +322,15 @@ class CharacterTokenizer:
self.normalize_text = normalize_text self.normalize_text = normalize_text
def tokenize(self, text): def tokenize(self, text):
"""Tokenizes a piece of text into characters. """
Tokenizes a piece of text into characters.
For example, :obj:`input = "apple"` will return as output :obj:`["a", "p", "p", "l", "e"]`.
For example:
input = "apple"
output = ["a", "p", "p", "l", "e"]
Args: Args:
text: A single token or whitespace separated tokens. text: A single token or whitespace separated tokens.
This should have already been passed through `BasicTokenizer`. This should have already been passed through `BasicTokenizer`.
Returns: Returns:
A list of characters. A list of characters.
""" """
......
...@@ -50,7 +50,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { ...@@ -50,7 +50,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
def get_pairs(word): def get_pairs(word):
"""Return set of symbol pairs in a word. """
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings). Word is represented as tuple of symbols (symbols being variable-length strings).
""" """
...@@ -83,23 +84,22 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -83,23 +84,22 @@ class BertweetTokenizer(PreTrainedTokenizer):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
...@@ -178,9 +178,8 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -178,9 +178,8 @@ class BertweetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. A BERTweet sequence has the following format:
A BERTweet sequence has the following format:
- single sequence: ``<s> X </s>`` - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>`` - pair of sequences: ``<s> A </s></s> B </s>``
...@@ -236,8 +235,8 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -236,8 +235,8 @@ class BertweetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
BERTweet does not make use of token type ids, therefore a list of zeros is returned. not make use of token type ids, therefore a list of zeros is returned.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
...@@ -411,8 +410,7 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -411,8 +410,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
def add_from_file(self, f): def add_from_file(self, f):
""" """
Loads a pre-existing dictionary from a text file and adds its symbols Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
to this instance.
""" """
if isinstance(f, str): if isinstance(f, str):
try: try:
...@@ -446,23 +444,17 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -446,23 +444,17 @@ class BertweetTokenizer(PreTrainedTokenizer):
""" """
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:
domains and tasks. The basic logic is this:
1. The tuple regex_strings defines a list of regular expression 1. The tuple regex_strings defines a list of regular expression strings.
strings.
2. The regex_strings strings are put, in order, into a compiled 2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re.
regular expression object called word_re.
3. The tokenization is done by word_re.findall(s), where s is the 3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of
user-supplied string, inside the tokenize() method of the class the class Tokenizer.
Tokenizer.
4. When instantiating Tokenizer objects, there is a single option: 4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
preserve_case. By default, it is set to True. If it is set to is set to False, then the tokenizer will downcase everything except for emoticons.
False, then the tokenizer will downcase everything except for
emoticons.
""" """
...@@ -582,6 +574,7 @@ REGEXPS = ( ...@@ -582,6 +574,7 @@ REGEXPS = (
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""", r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
# email addresses # email addresses
r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""", r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
# docstyle-ignore
# Remaining word types: # Remaining word types:
r""" r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes. (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
...@@ -627,28 +620,24 @@ def _str_to_unicode(text, encoding=None, errors="strict"): ...@@ -627,28 +620,24 @@ def _str_to_unicode(text, encoding=None, errors="strict"):
def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"): def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
""" """
Remove entities from text by converting them to their Remove entities from text by converting them to their corresponding unicode character.
corresponding unicode character.
Args: Args:
text: text:
A unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8'). A unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8').
keep (list): keep (list):
List of entity names which should not be replaced. This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``) List of entity names which should not be replaced. This supports both numeric entities (``&#nnnn;`` and
and named entities (such as ``&nbsp;`` or ``&gt;``). ``&#hhhh;``) and named entities (such as ``&nbsp;`` or ``&gt;``).
remove_illegal (bool): remove_illegal (bool):
If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are kept "as is". If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
kept "as is".
Returns: A unicode string with the entities removed. Returns: A unicode string with the entities removed.
See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
>>> from nltk.tokenize.casual import _replace_html_entities >>> from nltk.tokenize.casual import _replace_html_entities
>>> _replace_html_entities(b'Price: &pound;100') >>> _replace_html_entities(b'Price: &pound;100')
'Price: \\xa3100' 'Price: \\xa3100'
>>> print(_replace_html_entities(b'Price: &pound;100')) >>> print(_replace_html_entities(b'Price: &pound;100'))
Price: £100 Price: £100
>>> >>>
""" """
def _convert_entity(match): def _convert_entity(match):
...@@ -714,8 +703,8 @@ class TweetTokenizer: ...@@ -714,8 +703,8 @@ class TweetTokenizer:
Args: Args:
text: str text: str
Returns: list(str) Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
A tokenized list of strings; concatenating this list returns the original string if `preserve_case=False` `preserve_case=False`
""" """
# Fix HTML character entities: # Fix HTML character entities:
text = _replace_html_entities(text) text = _replace_html_entities(text)
...@@ -742,8 +731,7 @@ class TweetTokenizer: ...@@ -742,8 +731,7 @@ class TweetTokenizer:
def reduce_lengthening(text): def reduce_lengthening(text):
""" """
Replace repeated character sequences of length 3 or greater with sequences Replace repeated character sequences of length 3 or greater with sequences of length 3.
of length 3.
""" """
pattern = regex.compile(r"(.)\1{2,}") pattern = regex.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text) return pattern.sub(r"\1\1\1", text)
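Sketch of the substitution's effect (assuming the function is module-level and called directly, as in the NLTK original it is adapted from):

::

    reduce_lengthening("heyyyyyy")   # 'heyyy'  -- runs of 3+ identical characters collapse to exactly 3
    reduce_lengthening("waaaaayyy")  # 'waaayyy'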
......