Unverified Commit bb7c4685 authored by Lysandre Debut, committed by GitHub

Documentation (#2989)

* All Tokenizers

BertTokenizer + few fixes
RobertaTokenizer
OpenAIGPTTokenizer + Fixes
GPT2Tokenizer + fixes
TransfoXLTokenizer
Correct rst for TransformerXL
XLMTokenizer + fixes
XLNet Tokenizer + Style
DistilBERT + Fix XLNet RST
CTRLTokenizer
CamemBERT Tokenizer
FlaubertTokenizer
XLMRobertaTokenizer
cleanup

* cleanup
parent c913eb9c
@@ -116,8 +116,21 @@ def get_pairs(word):
class CTRLTokenizer(PreTrainedTokenizer):
"""
Constructs a CTRL tokenizer. Peculiarities:

- Byte-Pair-Encoding
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer): ...@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
    logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
    return
...
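A minimal usage sketch of the API documented above (the ``ctrl`` shortcut name and the output directory are illustrative assumptions, not part of this diff)::

    import os
    from transformers import CTRLTokenizer

    tokenizer = CTRLTokenizer.from_pretrained("ctrl")  # assumed shortcut checkpoint name
    ids = tokenizer.encode("Links Hello world")        # BPE over the vocab/merges files documented above

    # save_vocabulary expects an existing directory and returns the paths it wrote
    os.makedirs("./ctrl-tokenizer", exist_ok=True)
    vocab_file, merges_file = tokenizer.save_vocabulary("./ctrl-tokenizer")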
@@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = {
class DistilBertTokenizer(BertTokenizer):
r"""
Constructs a DistilBertTokenizer.

:class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.

Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
"""

vocab_files_names = VOCAB_FILES_NAMES
...
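A short sketch of the equivalence described above; the standard shortcut names are used and both checkpoints are assumed to ship the same WordPiece vocabulary::

    from transformers import BertTokenizer, DistilBertTokenizer

    bert_tok = BertTokenizer.from_pretrained("bert-base-uncased")
    distil_tok = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # Same punctuation splitting + WordPiece pipeline, so the token streams should match
    assert bert_tok.tokenize("Hello, world!") == distil_tok.tokenize("Hello, world!")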
@@ -80,14 +80,14 @@ class FlaubertTokenizer(XLMTokenizer):
"""
BPE tokenizer for Flaubert

- Moses preprocessing & tokenization
- Normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols (ex: "__classify__") to a vocabulary
- `do_lowercase` controls lower casing (automatically set for pretrained vocabularies)

This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
and documentation regarding arguments.
"""

vocab_files_names = VOCAB_FILES_NAMES
...
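A minimal sketch of the inherited usage; the ``flaubert-base-cased`` shortcut name is an assumption for illustration::

    from transformers import FlaubertTokenizer

    tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")  # assumed checkpoint name
    # Moses preprocessing, normalization and (where configured) lower casing happen inside tokenize()
    tokens = tokenizer.tokenize("Bonjour, le monde !")
    ids = tokenizer.convert_tokens_to_ids(tokens)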
@@ -101,11 +101,35 @@ def get_pairs(word):
class GPT2Tokenizer(PreTrainedTokenizer):
"""
GPT-2 BPE tokenizer. Peculiarities:

- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
  ``add_prefix_space`` flag set to ``True``.
  Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
  the absence of a space at the beginning of a string:

::

    tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The end of sequence token.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer): ...@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer):
return text return text
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
    logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
    return
...
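A hedged sketch of the ``add_prefix_space`` behaviour described in the docstring above; the exact token strings shown in comments are indicative only::

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # Without the flag, a word at the start of the string is encoded without the leading
    # "Ġ" (space) marker that the same word would get in the middle of a sentence.
    plain = tokenizer.encode("Hello world")
    spaced = tokenizer.encode("Hello world", add_prefix_space=True)

    print(tokenizer.convert_ids_to_tokens(plain))   # e.g. ['Hello', 'Ġworld']
    print(tokenizer.convert_ids_to_tokens(spaced))  # e.g. ['ĠHello', 'Ġworld']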
@@ -82,8 +82,21 @@ def text_standardize(text):
class OpenAIGPTTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer. Peculiarities:

- lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, falling back to BERT's BasicTokenizer if not.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): ...@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
    logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
    return
...
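A brief sketch of the lower casing peculiarity noted above, assuming the standard ``openai-gpt`` shortcut name::

    from transformers import OpenAIGPTTokenizer

    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

    # Inputs are lower cased before BPE, so differently cased strings are expected to
    # produce the same token stream.
    assert tokenizer.tokenize("Hello World") == tokenizer.tokenize("hello world")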
@@ -16,6 +16,7 @@
import logging
from typing import List, Optional

from tokenizers.processors import RobertaProcessing
@@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class RobertaTokenizer(GPT2Tokenizer):
"""
Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:

- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
  ``add_prefix_space`` flag set to ``True``.
  Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
  the absence of a space at the beginning of a string:

::

    tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer): ...@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer):
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A RoBERTa sequence has the following format: A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
...@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer): ...@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer):
sep = [self.sep_token_id] sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
""" """
if already_has_special_tokens:
    if token_ids_1 is not None:
@@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer):
    return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned. RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
if token_ids_1 is None, only returns the first portion of the mask (0's). Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
......
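A sketch exercising the three methods documented above; ``roberta-base`` is the standard shortcut name::

    from transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    ids_a = tokenizer.encode("Hello", add_special_tokens=False)
    ids_b = tokenizer.encode("World", add_special_tokens=False)

    single = tokenizer.build_inputs_with_special_tokens(ids_a)        # <s> X </s>
    pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)   # <s> A </s></s> B </s>

    # 1 marks the special tokens that would be added, 0 marks sequence tokens
    mask = tokenizer.get_special_tokens_mask(ids_a, ids_b)
    # RoBERTa does not use token type ids, so this is a list of zeros
    token_types = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)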
@@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin"
class TransfoXLTokenizer(PreTrainedTokenizer):
"""
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
raise ValueError("No <unkown> token in vocabulary") raise ValueError("No <unkown> token in vocabulary")
def save_vocabulary(self, vocab_path): def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file.""" """
Save the vocabulary and special tokens file to a directory.
Args:
vocab_path (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
logger.warning(
    "Please note you will not be able to load the save vocabulary in"
...
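A small sketch around the ``save_vocabulary`` signature documented above; ``transfo-xl-wt103`` is the standard shortcut name and the output directory is an assumption::

    import os
    from transformers import TransfoXLTokenizer

    tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")

    # The target directory must already exist; note the reload warning emitted above.
    os.makedirs("./transfo-xl-vocab", exist_ok=True)
    saved_files = tokenizer.save_vocabulary("./transfo-xl-vocab")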
@@ -21,6 +21,7 @@ import os
import re
import sys
import unicodedata
from typing import List, Optional

import sacremoses as sm
@@ -530,20 +531,59 @@ class XLMTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer for XLM

- Moses preprocessing & tokenization for most supported languages
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
- (optionally) lower case & normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols (ex: "__classify__") to a vocabulary
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
- `id2lang` attribute does reverse mapping if provided (automatically set for pretrained vocabularies)

This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.

Args:
vocab_file (:obj:`string`):
Vocabulary file.
merges_file (:obj:`string`):
Merges file.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "</s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "<special1>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
List of additional special tokens.
lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`):
Dictionary mapping languages string identifiers to their IDs.
id2lang (:obj:`Dict[int, str]`, `optional`, defaults to :obj:`None`):
Dictionary mapping language IDs to their string identifiers.
do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase and remove accents when tokenizing.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer):
out_string = "".join(tokens).replace("</w>", " ").strip() out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A XLM sequence has the following format: A XLM sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
...@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
""" """
if already_has_special_tokens:
@@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer):
    return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.

An XLM sequence pair mask has the following format:

::

    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |

if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
    logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
    return
...
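A sketch of the pair format and segment ids documented above; ``xlm-mlm-en-2048`` is one of the standard shortcut names::

    from transformers import XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

    ids_a = tokenizer.encode("Hello", add_special_tokens=False)
    ids_b = tokenizer.encode("World", add_special_tokens=False)

    pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)  # <s> A </s> B </s>
    # 0s over <s> A </s>, 1s over B </s>, as in the mask diagram above
    token_types = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)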
@@ -18,6 +18,7 @@
import logging
import os
from shutil import copyfile
from typing import List, Optional

from transformers.tokenization_utils import PreTrainedTokenizer
@@ -54,7 +55,50 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
Adapted from RobertaTokenizer and XLNetTokenizer
SentencePiece based tokenizer. Peculiarities:

- requires `SentencePiece <https://github.com/google/sentencepiece>`_
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A RoBERTa sequence has the following format: A XLM-R sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
sep = [self.sep_token_id] sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
""" """
if already_has_special_tokens:
    if token_ids_1 is not None:
        raise ValueError(
@@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
    return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned. XLM-R does not make use of token type ids, therefore a list of zeros is returned.
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file """
to a directory. Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
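A short sketch tying together the SentencePiece attribute and the RoBERTa-style special tokens documented above; ``xlm-roberta-base`` is the standard shortcut name::

    from transformers import XLMRobertaTokenizer

    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

    ids = tokenizer.encode("Bonjour le monde", add_special_tokens=False)
    with_specials = tokenizer.build_inputs_with_special_tokens(ids)   # <s> X </s>

    # The underlying SentencePiece processor is exposed directly
    print(tokenizer.sp_model.GetPieceSize())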
@@ -19,6 +19,7 @@ import logging
import os
import unicodedata
from shutil import copyfile
from typing import List, Optional

from .tokenization_utils import PreTrainedTokenizer
@@ -51,9 +52,57 @@ SEG_ID_PAD = 4
class XLNetTokenizer(PreTrainedTokenizer):
"""
Constructs an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__

This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`string`, `optional`, defaults to "<sep>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "<cls>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer):
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
An XLNet sequence has the following format: An XLNet sequence has the following format:
single sequence: X <sep> <cls>
pair of sequences: A <sep> B <sep> <cls> - single sequence: ``X <sep> <cls>``
- pair of sequences: ``A <sep> B <sep> <cls>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer):
return token_ids_0 + sep + cls return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
""" """
if already_has_special_tokens:
@@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
    return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1]

def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLNet sequence pair mask has the following format: An XLNet sequence pair mask has the following format:
...@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer):
| first sequence | second sequence | CLS segment ID | first sequence | second sequence | CLS segment ID
if token_ids_1 is None, only returns the first portion of the mask (0's). if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls_segment_id = [2] cls_segment_id = [2]
...@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file """
to a directory. Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
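A sketch of the end-positioned special tokens and the CLS segment id documented above; ``xlnet-base-cased`` is the standard shortcut name::

    from transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

    ids_a = tokenizer.encode("Hello", add_special_tokens=False)
    ids_b = tokenizer.encode("World", add_special_tokens=False)

    pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)   # A <sep> B <sep> <cls>
    # Segment ids: 0s for the first sequence, 1s for the second, 2 for the trailing <cls>
    token_types = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)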