""" An abstract class to handle dowloading and loading pretrained tokenizers and adding tokens to the vocabulary.
""" Base class for all tokenizers.
Handle all the shared methods for tokenization and special tokens as well as methods dowloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary.
Derived class can set up a few special tokens to be used in common scripts and internals:
This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
specific vocabulary augmentation methods of the various underlying dictionnary structures (BPE, sentencepiece...).
Class attributes (overridden by derived classes):
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
- ``pretrained_vocab_files_map``: a python ``dict of dict``, the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level keys being the `short-cut-names` (string) of the pretrained models, with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
Parameters:
- ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token``
- ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token``
- ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token``
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token``
- ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token``
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token``
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token``
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensures they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens``
"""
vocab_files_names = {}
pretrained_vocab_files_map = {}
...
...
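# A hypothetical sketch of how a derived tokenizer class fills in the class attributes documented
# above. The class name, shortcut name and URL are illustrative placeholders, not real entries:
class MyTokenizer(PreTrainedTokenizer):
    vocab_files_names = {'vocab_file': 'vocab.txt'}
    pretrained_vocab_files_map = {
        'vocab_file': {'my-model-base': 'https://example.com/my-model-base/vocab.txt'}
    }
    max_model_input_sizes = {'my-model-base': 512}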
@property
def bos_token(self):
    """ Beginning of sentence token (string). Log an error if used while not having been set. """
    if self._bos_token is None:
        logger.error("Using bos_token, but it is not set yet.")
    return self._bos_token

@property
def eos_token(self):
    """ End of sentence token (string). Log an error if used while not having been set. """
    if self._eos_token is None:
        logger.error("Using eos_token, but it is not set yet.")
    return self._eos_token

@property
def unk_token(self):
    """ Unknown token (string). Log an error if used while not having been set. """
    if self._unk_token is None:
        logger.error("Using unk_token, but it is not set yet.")
    return self._unk_token

@property
def sep_token(self):
    """ Separation token (string). E.g. to separate context and query in an input sequence. Log an error if used while not having been set. """
    if self._sep_token is None:
        logger.error("Using sep_token, but it is not set yet.")
    return self._sep_token

@property
def pad_token(self):
    """ Padding token (string). Log an error if used while not having been set. """
    if self._pad_token is None:
        logger.error("Using pad_token, but it is not set yet.")
    return self._pad_token

@property
def cls_token(self):
    """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
    if self._cls_token is None:
        logger.error("Using cls_token, but it is not set yet.")
    return self._cls_token

@property
def mask_token(self):
    """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
    if self._mask_token is None:
        logger.error("Using mask_token, but it is not set yet.")
    return self._mask_token

@property
def additional_special_tokens(self):
    """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
    if self._additional_special_tokens is None:
        logger.error("Using additional_special_tokens, but it is not set yet.")
    return self._additional_special_tokens
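# Illustrative usage sketch for the properties above, assuming the BertTokenizer derived class used
# in the examples further down: each property returns a plain string once the token is set, and
# logs an error (returning None) otherwise.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.mask_token)  # [CLS] [SEP] [MASK]
print(tokenizer.bos_token)  # BertTokenizer sets no bos_token, so this logs an error and prints None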
...
...
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
    r"""
    Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
Args:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
cache_dir: (`optional`) string:
Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force to (re-)download the vocabulary files and override the cached versions if they exist.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
Examples::
# We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer
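# (Sketch: 'bert-base-uncased' is a predefined shortcut name; the local paths below are
# hypothetical placeholders.)

# Download the vocabulary from the web and cache it:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load from a directory previously populated with save_pretrained('./my_model_directory/'):
tokenizer = BertTokenizer.from_pretrained('./my_model_directory/')

# Override a special token at loading time (the replacement string should already be in the vocabulary):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')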
def save_pretrained(self, save_directory):
    """ Save the tokenizer vocabulary files (with added tokens) and the
    special-tokens-to-class-attributes mapping to a directory.

    This method makes sure the full tokenizer can then be re-loaded using the
    :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
    """
    if not os.path.isdir(save_directory):
        logger.error("Saving directory ({}) should be a directory".format(save_directory))
...
...
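# Minimal save/re-load round trip (sketch; assumes a tokenizer instance such as BertTokenizer and
# that './my_model_directory/' already exists, since save_pretrained only logs an error instead of
# creating the directory):
tokenizer.save_pretrained('./my_model_directory/')
tokenizer = BertTokenizer.from_pretrained('./my_model_directory/')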
def save_vocabulary(self, save_directory):
    """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
    and special token mappings.

    Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` to save the full
    tokenizer state if you want to reload it using the
    :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
    """
    raise NotImplementedError
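# Hypothetical minimal override in a derived class, assuming the subclass keeps its vocabulary as a
# token -> index dict in self.encoder and that the convention is to return the saved file path(s);
# real tokenizers each have their own file formats, so this is only a sketch of the expected shape:
def save_vocabulary(self, save_directory):
    vocab_file = os.path.join(save_directory, 'vocab.txt')
    with open(vocab_file, 'w', encoding='utf-8') as writer:
        for token, _ in sorted(self.encoder.items(), key=lambda kv: kv[1]):
            writer.write(token + '\n')
    return (vocab_file,)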
def vocab_size(self):
    """ Size of the base vocabulary (without the added tokens) """
    raise NotImplementedError

def __len__(self):
    """ Size of the full vocabulary with the added tokens """

def add_tokens(self, new_tokens):
    """
    Add a list of new tokens to the tokenizer class. If the new tokens are not in the
    vocabulary, they are added to the ``added_tokens_encoder`` with indices starting from
    the length of the current vocabulary.
Args:
new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
Returns:
Number of tokens added to the vocabulary.
Examples::
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

num_added_toks = tokenizer.add_tokens(['new_tok1', 'new_tok2'])  # illustrative token strings
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects the full size of the new vocabulary, i.e. the length of the tokenizer.