@@ -180,9 +180,10 @@ class PreTrainedTokenizer(object):
...
@@ -180,9 +180,10 @@ class PreTrainedTokenizer(object):
@classmethod
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
def from_pretrained(cls, *inputs, **kwargs):
r""" Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
r"""
Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
Parameters:
Args:
pretrained_model_name_or_path: either:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
...
@@ -383,14 +384,15 @@ class PreTrainedTokenizer(object):
...
@@ -383,14 +384,15 @@ class PreTrainedTokenizer(object):
def add_tokens(self, new_tokens):
def add_tokens(self, new_tokens):
""" Add a list of new tokens to the tokenizer class. If the new tokens are not in the
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the
vocabulary, they are added to it with indices starting from length of the current vocabulary.
vocabulary, they are added to it with indices starting from length of the current vocabulary.
Parameters:
Args:
new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
Returns:
Returns:
Number of tokens added to the vocabulary.
Number of tokens added to the vocabulary.
Examples::
Examples::
...
@@ -422,17 +424,20 @@ class PreTrainedTokenizer(object):
...
@@ -422,17 +424,20 @@ class PreTrainedTokenizer(object):
def add_special_tokens(self, special_tokens_dict):
def add_special_tokens(self, special_tokens_dict):
""" Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
"""
to class attributes. If special tokens are NOT in the vocabulary, they are added
Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
to it (indexed starting from the last index of the current vocabulary).
to class attributes. If special tokens are NOT in the vocabulary, they are added
to it (indexed starting from the last index of the current vocabulary).
Parameters:
Args:
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``].
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: