"csrc/vscode:/vscode.git/clone" did not exist on "5ab9b3667bc39a0063ed588be63fe8989d9da080"
Unverified commit 3323146e authored by Sylvain Gugger, committed by GitHub

Models doc (#7345)



* Clean up model documentation

* Formatting

* Preparation work

* Long lines

* Main work on rst files

* Cleanup all config files

* Syntax fix

* Clean all tokenizers

* Work on first models

* Models beginning

* FlauBERT

* All PyTorch models

* All models

* Long lines again

* Fixes

* More fixes

* Update docs/source/model_doc/bert.rst
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update docs/source/model_doc/electra.rst
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Last fixes
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
parent 58405a52
@@ -51,45 +51,44 @@ SPIECE_UNDERLINE = "▁"
class CamembertTokenizer(PreTrainedTokenizer):
"""
Adapted from RobertaTokenizer and XLNetTokenizer
SentencePiece based tokenizer. Peculiarities:
Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Construct a
CamemBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
@@ -146,19 +145,19 @@ class CamembertTokenizer(PreTrainedTokenizer):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A CamemBERT sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
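As an illustration of the documented format, here is a minimal sketch (a hypothetical helper, not the library's exact method body) of how the special token ids compose the output:

# Hypothetical sketch of the CamemBERT format documented above.
# single sequence: <s> X </s>        pair: <s> A </s></s> B </s>
def build_inputs_sketch(cls_id, sep_id, token_ids_0, token_ids_1=None):
    if token_ids_1 is None:
        return [cls_id] + token_ids_0 + [sep_id]
    return [cls_id] + token_ids_0 + [sep_id, sep_id] + token_ids_1 + [sep_id]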
@@ -171,16 +170,16 @@ class CamembertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
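A short usage sketch of this method (the checkpoint name is an assumption; the exact mask depends on the vocabulary):

from transformers import CamembertTokenizer

tok = CamembertTokenizer.from_pretrained("camembert-base")  # assumed checkpoint
ids = tok.encode("J'aime le camembert", add_special_tokens=True)
# Positions of <s> and </s> are marked with 1, regular tokens with 0:
mask = tok.get_special_tokens_mask(ids, already_has_special_tokens=True)
# e.g. [1, 0, 0, 0, 0, 0, 1]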
@@ -201,18 +200,17 @@ class CamembertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
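Since the mask is all zeros, the method reduces to measuring the length of the full sequence with special tokens; a minimal sketch under that reading (hypothetical helper, not the exact method body):

# CamemBERT/RoBERTa token type ids are all zeros, with or without a pair.
def token_type_ids_sketch(cls_id, sep_id, token_ids_0, token_ids_1=None):
    if token_ids_1 is None:
        return [0] * len([cls_id] + token_ids_0 + [sep_id])
    return [0] * len([cls_id] + token_ids_0 + [sep_id, sep_id] + token_ids_1 + [sep_id])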
@@ -116,19 +116,17 @@ def get_pairs(word):
class CTRLTokenizer(PreTrainedTokenizer):
"""
Constructs a CTRL tokenizer. Peculiarities:
Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.
- Byte-Pair-Encoding
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
@@ -55,10 +55,10 @@ PRETRAINED_INIT_CONFIGURATION = {
class DistilBertTokenizer(BertTokenizer):
r"""
Constructs a DistilBertTokenizer.
Construct a DistilBERT tokenizer.
:class:`~transformers.DistilBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
@@ -73,10 +73,10 @@ class DistilBertTokenizer(BertTokenizer):
class DistilBertTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" DistilBertTokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
@@ -68,10 +68,10 @@ READER_PRETRAINED_INIT_CONFIGURATION = {
class DPRContextEncoderTokenizer(BertTokenizer):
r"""
Constructs a DPRContextEncoderTokenizer.
Construct a DPRContextEncoder tokenizer.
:class:`~transformers.DPRContextEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.DPRContextEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
@@ -85,10 +85,10 @@ class DPRContextEncoderTokenizer(BertTokenizer):
class DPRContextEncoderTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" DPRContextEncoderTokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.DPRContextEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.DPRContextEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and
runs end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
@@ -102,10 +102,10 @@ class DPRContextEncoderTokenizerFast(BertTokenizerFast):
class DPRQuestionEncoderTokenizer(BertTokenizer):
r"""
Constructs a DPRQuestionEncoderTokenizer.
Construct a DPRQuestionEncoder tokenizer.
:class:`~transformers.DPRQuestionEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.DPRQuestionEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
@@ -119,10 +119,10 @@ class DPRQuestionEncoderTokenizer(BertTokenizer):
class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" DPRQuestionEncoderTokenizer (backed by HuggingFace's `tokenizers` library).
Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.DPRQuestionEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.DPRQuestionEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and
runs end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
@@ -142,59 +142,71 @@ DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "en
CUSTOM_DPR_READER_DOCSTRING = r"""
Return a dictionary with the token ids of the input strings and other information to give to :obj:`.decode_best_spans`.
It converts the strings of a question and different passages (title + text) in a sequence of ids (integer), using the tokenizer and vocabulary.
The resulting `input_ids` is a matrix of size :obj:`(n_passages, sequence_length)` with the format:
Return a dictionary with the token ids of the input strings and other information to give to
:obj:`.decode_best_spans`.
It converts the strings of a question and different passages (title and text) in a sequence of IDs (integers),
using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of size
:obj:`(n_passages, sequence_length)` with the format:
[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
Inputs:
questions (:obj:`str`, :obj:`List[str]`):
Args:
questions (:obj:`str` or :obj:`List[str]`):
The questions to be encoded.
You can specify one question for many passages. In this case, the question will be duplicated like :obj:`[questions] * n_passages`.
You can specify one question for many passages. In this case, the question will be duplicated like
:obj:`[questions] * n_passages`.
Otherwise you have to specify as many questions as in :obj:`titles` or :obj:`texts`.
titles (:obj:`str`, :obj:`List[str]`):
The passages titles to be encoded. This can be a string, a list of strings if there are several passages.
texts (:obj:`str`, :obj:`List[str]`):
The passages texts to be encoded. This can be a string, a list of strings if there are several passages.
padding (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`):
Activate and control padding. Accepts the following values:
* `True` or `'longest'`: pad to the longest sequence in the batch (or no padding if only a single sequence if provided),
* `'max_length'`: pad to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`)
* `False` or `'do_not_pad'` (default): No padding (i.e. can output batch with sequences of uneven lengths)
truncation (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`):
Activate and control truncation. Accepts the following values:
* `True` or `'only_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`).
* `False` or `'do_not_truncate'` (default): No truncation (i.e. can output batch with sequences length greater than the model max admissible input size)
max_length (:obj:`Union[int, None]`, `optional`):
Control the length for padding/truncation. Accepts the following values
* `None` (default): This will use the predefined model max length if required by one of the truncation/padding parameters. If the model has no specific max input length (e.g. XLNet) truncation/padding to max length is deactivated.
* `any integer value` (e.g. `42`): Use this specific maximum length value if required by one of the truncation/padding parameters.
return_tensors (:obj:`str`, `optional`):
Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
PyTorch :obj:`torch.Tensor` or Numpy :obj: `np.ndarray` instead of a list of python integers.
return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`none`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
titles (:obj:`str` or :obj:`List[str]`):
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
texts (:obj:`str` or :obj:`List[str]`):
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls truncation. Accepts the following values:
* :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
:obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
provided. This will truncate token by token, removing a token from the longest sequence in the pair
if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
sequence lengths greater than the model maximum admissible input size).
max_length (:obj:`int`, `optional`):
Controls the maximum length to use by one of the truncation/padding parameters.
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
return_attention_mask (:obj:`bool`, `optional`):
Whether or not to return the attention mask. If not set, will return the attention mask according to the
specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
Return:
A Dictionary of shape::
{
input_ids: list[list[int]],
attention_mask: list[int] if return_attention_mask is True (default)
}
With the fields:
- ``input_ids``: list of token ids to be fed to a model
- ``attention_mask``: list of indices specifying which tokens should be attended to by the model
:obj:`Dict[str, List[List[int]]]`: A dictionary with the following keys:
- ``input_ids``: List of token ids to be fed to a model.
- ``attention_mask``: List of indices specifying which tokens should be attended to by the model.
"""
@@ -369,16 +381,14 @@ class CustomDPRReaderTokenizerMixin:
@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer):
r"""
Constructs a DPRReaderTokenizer.
:class:`~transformers.DPRReaderTokenizer` is alsmost identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Construct a DPRReader tokenizer.
What is different is that is has three inputs strings: question, titles and texts that are combined to feed into the DPRReader model.
:class:`~transformers.DPRReaderTokenizer` is almost identical to :class:`~transformers.BertTokenizer` and runs
end-to-end tokenization: punctuation splitting and wordpiece. The difference is that it has three input strings:
question, titles and texts that are combined to be fed to the :class:`~transformers.DPRReader` model.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -391,14 +401,13 @@ class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer):
@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
r"""
Constructs a DPRReaderTokenizerFast.
:class:`~transformers.DPRReaderTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's `tokenizers` library).
What is different is that is has three inputs strings: question, titles and texts that are combined to feed into the DPRReader model.
:class:`~transformers.DPRReaderTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and
runs end-to-end tokenization: punctuation splitting and wordpiece. The difference is that it has three input
strings: question, titles and texts that are combined to be fed to the :class:`~transformers.DPRReader` model.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
@@ -51,9 +51,10 @@ PRETRAINED_INIT_CONFIGURATION = {
class ElectraTokenizer(BertTokenizer):
r"""
Constructs an Electra tokenizer.
Construct an ELECTRA tokenizer.
:class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
@@ -67,10 +68,10 @@ class ElectraTokenizer(BertTokenizer):
class ElectraTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" Electra Fast tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
@@ -78,13 +78,13 @@ def convert_to_unicode(text):
class FlaubertTokenizer(XLMTokenizer):
"""
BPE tokenizer for Flaubert
Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
- Moses preprocessing & tokenization
- Normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- `do_lowercase` controle lower casing (automatically set for pretrained vocabularies)
- Moses preprocessing and tokenization.
- Normalizing all input text.
- The argument ``special_tokens`` and the function ``set_special_tokens`` can be used to add additional symbols
(like "__classify__") to a vocabulary.
- The argument :obj:`do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
and documentation regarding arguments.
@@ -122,44 +122,43 @@ def remove_non_printing_char(text):
class FSMTTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer for FSMT (fairseq transformer)
See: https://github.com/pytorch/fairseq/tree/master/examples/wmt19
Construct a FAIRSEQ Transformer tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
- Moses preprocessing & tokenization for most supported languages
- (optionally) lower case & normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- `langs` defines a pair of languages
- Moses preprocessing and tokenization.
- Normalizing all input text.
- The argument ``special_tokens`` and the function ``set_special_tokens`` can be used to add additional symbols
(like "__classify__") to a vocabulary.
- The argument :obj:`langs` defines a pair of languages.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
langs (:obj:`List[str]`):
a list of two languages to translate from and to, e.g. ``["en", "ru"]``.
src_vocab_file (:obj:`string`):
Source language vocabulary file.
tgt_vocab_file (:obj:`string`):
Target language vocabulary file.
merges_file (:obj:`string`):
Merges file.
A list of two languages to translate from and to, for instance :obj:`["en", "ru"]`.
src_vocab_file (:obj:`str`):
File containing the vocabulary for the source language.
tgt_vocab_file (:obj:`str`):
File containing the vocabulary for the target language.
merges_file (:obj:`str`):
File containing the merges.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
Whether or not to lowercase the input when tokenizing.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
"""
@@ -369,20 +368,19 @@ class FSMTTokenizer(PreTrainedTokenizer):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A FAIRSEQ_TRANSFORMER sequence has the following format:
A FAIRSEQ Transformer sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
sep = [self.sep_token_id]
@@ -395,16 +393,16 @@ class FSMTTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` methods.
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -431,25 +429,28 @@ class FSMTTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An FAIRSEQ_TRANSFORMER sequence pair mask has the following format:
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A FAIRSEQ Transformer sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
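A minimal sketch of the documented 0/1 layout (a hypothetical helper mirroring the format above, not the exact method body):

# FAIRSEQ Transformer pair mask: 0s cover the first sequence and its </s>,
# 1s cover the second sequence and its </s>.
def fsmt_token_type_ids_sketch(sep_id, token_ids_0, token_ids_1=None):
    if token_ids_1 is None:
        return [0] * len(token_ids_0 + [sep_id])
    return [0] * len(token_ids_0 + [sep_id]) + [1] * len(token_ids_1 + [sep_id])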
@@ -470,8 +471,6 @@ class FSMTTokenizer(PreTrainedTokenizer):
padding="longest",
**unused,
) -> BatchEncoding:
"""Prepare model inputs for translation. For best performance, translate one sentence at a time."""
if type(src_texts) is not list:
raise ValueError("src_texts is expected to be a list")
if "" in src_texts:
@@ -499,7 +498,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
vocab_path (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
@@ -57,10 +57,10 @@ PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{name}": {"do_lower_case":
class FunnelTokenizer(BertTokenizer):
r"""
Tokenizer for the Funnel Transformer models.
Construct a Funnel Transformer tokenizer.
:class:`~transformers.FunnelTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
@@ -110,19 +110,19 @@ class FunnelTokenizer(BertTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Funnel Transformer expects a sequence pair mask that has the following format:
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A Funnel Transformer sequence pair mask has the following format:
::
2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0's).
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
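The distinctive part of the Funnel format is the leading 2 on the [CLS] position; a hypothetical sketch of that layout (not the exact method body):

# Funnel Transformer pair mask: [CLS] gets token type 2, then 0s and 1s as usual.
def funnel_token_type_ids_sketch(sep_id, token_ids_0, token_ids_1=None):
    if token_ids_1 is None:
        return [2] + [0] * len(token_ids_0 + [sep_id])
    return [2] + [0] * len(token_ids_0 + [sep_id]) + [1] * len(token_ids_1 + [sep_id])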
@@ -139,10 +139,10 @@ class FunnelTokenizer(BertTokenizer):
class FunnelTokenizerFast(BertTokenizerFast):
r"""
"Fast" tokenizer for the Funnel Transformer models (backed by HuggingFace's :obj:`tokenizers` library).
Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.FunnelTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting + wordpiece.
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
@@ -192,19 +192,19 @@ class FunnelTokenizerFast(BertTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Funnel Transformer expects a sequence pair mask that has the following format:
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A Funnel Transformer sequence pair mask has the following format:
::
2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0's).
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
@@ -103,7 +103,7 @@ def get_pairs(word):
class GPT2Tokenizer(PreTrainedTokenizer):
"""
GPT-2 BPE tokenizer, using byte-level Byte-Pair-Encoding.
Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently depending on whether it is at the beginning of the sentence (without space) or not:
@@ -124,24 +124,27 @@ class GPT2Tokenizer(PreTrainedTokenizer):
When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first one).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This makes it possible to treat the leading word just
like any other word (the GPT2 tokenizer detects the beginning of words by the preceding space).
"""
vocab_files_names = VOCAB_FILES_NAMES
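To make the space-handling behavior concrete, a short usage sketch (the checkpoint name is standard; the exact ids are indicative and depend on the vocabulary):

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")  # assumed checkpoint
tok("Hello world")["input_ids"]    # e.g. [15496, 995]
tok(" Hello world")["input_ids"]   # e.g. [18435, 995] -- the leading space changes the first token
# With add_prefix_space=True every input is treated as if it followed a space:
tok2 = GPT2Tokenizer.from_pretrained("gpt2", add_prefix_space=True)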
@@ -305,7 +308,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
class GPT2TokenizerFast(PreTrainedTokenizerFast):
"""
Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library), using byte-level
Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
@@ -328,30 +331,29 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
``add_prefix_space=True``.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to `False`):
Whether to add a leading space to the first word.
This allows to treat the leading word just as any other word.
(GPT2 tokenizer detect beginning of words by the preceeding space)
trim_offsets (:obj:`bool`, `optional`, defaults to `True`):
Whether the post processing step should trim offsets to avoid including whitespaces.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This makes it possible to treat the leading word just
like any other word (the GPT2 tokenizer detects the beginning of words by the preceding space).
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -42,6 +42,12 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class LongformerTokenizer(RobertaTokenizer):
r"""
Construct a Longformer tokenizer.
:class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to
the superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = {
@@ -51,6 +57,12 @@ class LongformerTokenizer(RobertaTokenizer):
class LongformerTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer
to the superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = {
@@ -50,9 +50,10 @@ PRETRAINED_INIT_CONFIGURATION = {
class LxmertTokenizer(BertTokenizer):
r"""
Constructs an Lxmert tokenizer.
Construct an LXMERT tokenizer.
:class:`~transformers.LxmertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
@@ -66,10 +67,10 @@ class LxmertTokenizer(BertTokenizer):
class LxmertTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" Lxmert Fast tokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
@@ -137,7 +137,6 @@ class MarianTokenizer(PreTrainedTokenizer):
padding="longest",
**unused,
) -> BatchEncoding:
"""Prepare model inputs for translation. For best performance, translate one sentence at a time."""
if "" in src_texts:
raise ValueError(f"found empty string in src_texts: {src_texts}")
self.current_spm = self.spm_source
@@ -171,67 +171,6 @@ class MBartTokenizer(XLMRobertaTokenizer):
add_prefix_space: bool = False, # ignored
**kwargs,
) -> BatchEncoding:
"""Prepare a batch that can be passed directly to an instance of MBartModel.
Arguments:
src_texts: (:obj:`list`):
list of documents to summarize or source language texts
src_lang: (:obj:`str`, `optional`, default='en_XX'):
default en_XX (english), the language we are translating from
tgt_texts: (:obj:`list`, `optional`):
list of tgt language texts or summaries.
tgt_lang: (:obj:`str`, `optional`, default='ro_RO'):
default ro_RO (romanian), the language we are translating to
max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts)
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries)
If left unset or set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`):
Activates and controls truncation. Accepts the following values:
* :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
:obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
provided. This will truncate token by token, removing a token from the longest sequence in the pair
if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
sequence lengths greater than the model maximum admissible input size).
Return:
:class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
- **input_ids** -- List of token ids to be fed to the encoder.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **labels** -- List of token ids for tgt_texts
The full set of keys ``[input_ids, attention_mask, decoder_input_ids, labels]``,
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
"""
if max_length is None:
max_length = self.max_len
self.set_src_lang_special_tokens(src_lang)
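A hedged usage sketch consistent with the docstring above (the checkpoint name and example sentences are assumptions for illustration):

from transformers import MBartTokenizer

tok = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")  # assumed checkpoint
batch = tok.prepare_seq2seq_batch(
    src_texts=["UN Chief Says There Is No Military Solution in Syria"],
    src_lang="en_XX",
    tgt_texts=["Şeful ONU declară că nu există o soluţie militară în Siria"],
    tgt_lang="ro_RO",
)
# batch always contains input_ids and attention_mask; the target-side keys
# (e.g. labels) are only present when tgt_texts is passed.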
@@ -35,10 +35,10 @@ PRETRAINED_INIT_CONFIGURATION = {}
class MobileBertTokenizer(BertTokenizer):
r"""
Constructs a MobileBertTokenizer.
Construct a MobileBERT tokenizer.
:class:`~transformers.MobileBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
@@ -52,10 +52,10 @@ class MobileBertTokenizer(BertTokenizer):
class MobileBertTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" MobileBertTokenizer (backed by HuggingFace's `tokenizers` library).
Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
@@ -75,20 +75,21 @@ def text_standardize(text):
class OpenAIGPTTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer. Peculiarities:
Construct a GPT tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:
- lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
- lowercases all inputs,
- uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, falling back to BERT's
:obj:`BasicTokenizer` if not.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
@@ -206,7 +207,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
vocab_path (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
@@ -239,22 +240,22 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "Fast" BPE tokenizer for OpenAI GPT (backed by HuggingFace's `tokenizers` library).
Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with
the following peculiarities:
Peculiarities:
- lowercases all inputs,
- uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, falling back to BERT's
:obj:`BasicTokenizer` if not.
- lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
@@ -66,46 +66,44 @@ def get_pairs(word):
class PhobertTokenizer(PreTrainedTokenizer):
"""
Constructs a PhoBERT tokenizer. Peculiarities:
Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.
- Byte-Pair-Encoding
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
@@ -171,12 +169,12 @@ class PhobertTokenizer(PreTrainedTokenizer):
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
@@ -189,16 +187,16 @@ class PhobertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` methods.
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -220,18 +218,17 @@ class PhobertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
PhoBERT does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
@@ -318,9 +315,11 @@ class PhobertTokenizer(PreTrainedTokenizer):
def save_vocabulary(self, save_directory):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
@@ -53,26 +53,26 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class ReformerTokenizer(PreTrainedTokenizer):
"""
Constructs an Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
Construct a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`string`):
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
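A brief instantiation sketch (assuming the google/reformer-crime-and-punishment checkpoint; the pieces shown are illustrative):

from transformers import ReformerTokenizer

tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
print(tokenizer.tokenize("Hello world."))  # SentencePiece pieces, e.g. ['▁Hello', '▁world', '.']
print(tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token)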
......@@ -165,8 +165,15 @@ class ReformerTokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
"""Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
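A hedged usage sketch; per the body above, the target must be an existing directory, and the returned tuple holds the path(s) written (file names depend on the tokenizer):

import tempfile

from transformers import ReformerTokenizer

tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
save_dir = tempfile.mkdtemp()  # must already exist, or save_vocabulary logs an error
paths = tokenizer.save_vocabulary(save_dir)
print(paths)  # e.g. ('/tmp/.../spiece.model',)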
......
......@@ -40,10 +40,10 @@ PRETRAINED_INIT_CONFIGURATION = {
class RetriBertTokenizer(BertTokenizer):
r"""
Constructs a retribert.
Construct a RetriBERT tokenizer.
:class:`~transformers.retribert is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.RetriBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
......@@ -58,10 +58,10 @@ class RetriBertTokenizer(BertTokenizer):
class RetriBertTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" RetriBertTokenizerFast (backed by HuggingFace's `tokenizers` library).
Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
:class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
......
......@@ -62,7 +62,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class RobertaTokenizer(GPT2Tokenizer):
"""
Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
Construct a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
......@@ -83,47 +83,50 @@ class RobertaTokenizer(GPT2Tokenizer):
When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first one).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like
any other word (the RoBERTa tokenizer detects the beginning of words by the preceding space).
"""
vocab_files_names = VOCAB_FILES_NAMES
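The space-sensitivity described at the top of this docstring is easy to check; a sketch assuming the roberta-base checkpoint (the ids shown are indicative):

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# The same word receives different ids with and without a preceding space.
print(tokenizer("Hello world")["input_ids"])   # e.g. [0, 31414, 232, 2]
print(tokenizer(" Hello world")["input_ids"])  # e.g. [0, 20920, 232, 2]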
......@@ -184,12 +187,12 @@ class RobertaTokenizer(GPT2Tokenizer):
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......@@ -201,16 +204,16 @@ class RobertaTokenizer(GPT2Tokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
......@@ -231,18 +234,17 @@ class RobertaTokenizer(GPT2Tokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -267,7 +269,7 @@ class RobertaTokenizer(GPT2Tokenizer):
class RobertaTokenizerFast(GPT2TokenizerFast):
"""
Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
......@@ -290,29 +292,51 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
``add_prefix_space=True``.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to `False`):
Whether to add a leading space to the first word.
This allows to treat the leading word just as any other word.
(GPT2 tokenizer detect beginning of words by the preceeding space)
trim_offsets (:obj:`bool`, `optional`, defaults to `True`):
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like
any other word (the RoBERTa tokenizer detects the beginning of words by the preceding space).
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
......@@ -377,18 +401,17 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......
......@@ -21,7 +21,9 @@ import warnings
from shutil import copyfile
from typing import List, Optional
from .file_utils import add_start_docstrings
from .tokenization_utils import BatchEncoding, PreTrainedTokenizer
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING
from .utils import logging
......@@ -61,32 +63,33 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class T5Tokenizer(PreTrainedTokenizer):
"""
Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
Construct a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`string`):
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`):
extra_ids (:obj:`int`, `optional`, defaults to 100):
A number of extra ids added to the end of the vocabulary, for use as sentinels.
These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
Extra tokens are indexed from the end of the vocabulary up to beginnning ("<extra_id_0>" is the last token in the vocabulary like in T5 preprocessing
see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last
token in the vocabulary, like in T5 preprocessing; see `here
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
"""
......@@ -149,19 +152,19 @@ class T5Tokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1], 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
......@@ -191,7 +194,6 @@ class T5Tokenizer(PreTrainedTokenizer):
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
For some t5 tasks, model.config.prefix is specified. This must be used before tokenization.
A sequence has the following format:
- single sequence: ``X </s>``
......@@ -199,12 +201,12 @@ class T5Tokenizer(PreTrainedTokenizer):
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
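A hedged sketch of the ``X </s>`` format documented above (assuming t5-small):

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
with_specials = tokenizer.build_inputs_with_special_tokens(ids)
print(with_specials[-1] == tokenizer.eos_token_id)  # True: </s> is appended when not already present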
......@@ -261,8 +263,15 @@ class T5Tokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
"""Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......@@ -274,6 +283,7 @@ class T5Tokenizer(PreTrainedTokenizer):
return (out_vocab_file,)
@add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
......@@ -285,59 +295,6 @@ class T5Tokenizer(PreTrainedTokenizer):
truncation: bool = True,
**kwargs,
) -> BatchEncoding:
r"""
Prepare a batch that can be passed directly to an instance of :class:`~transformers.T5Model`.
Args:
src_texts: (:obj:`List[str]`):
List of documents to summarize or source language texts.
tgt_texts: (:obj:`List[str]`, `optional`):
List of summaries or target language texts.
max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts).
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries).
If left unset or set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`):
Activates and controls truncation. Accepts the following values:
* :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
:obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
provided. This will truncate token by token, removing a token from the longest sequence in the pair
if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
sequence lengths greater than the model maximum admissible input size).
**kwargs:
Additional keyword arguments passed along to :obj:`self.__call__`.
Returns:
:class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
- **input_ids** -- List of token ids to be fed to the encoder.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **labels** -- List of token ids for tgt_texts
The full set of keys ``[input_ids, attention_mask, decoder_input_ids, labels]``,
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
"""
if max_length is None:
max_length = self.max_len
model_inputs = self(
......
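Since the long docstring above now lives in ``PREPARE_SEQ2SEQ_BATCH_DOCSTRING``, a hedged usage sketch of the method (assuming t5-small; the key set follows that docstring):

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
batch = tokenizer.prepare_seq2seq_batch(
    src_texts=["translate English to German: Hello world"],
    tgt_texts=["Hallo Welt"],
    return_tensors="pt",
)
print(batch.keys())  # input_ids and attention_mask, plus labels because tgt_texts was passed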