@@ -30,7 +30,7 @@ from .tokenization_distilbert import DistilBertTokenizer
 logger = logging.getLogger(__name__)

 class AutoTokenizer(object):
-    r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class
+    r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
         that will be instantiated as one of the tokenizer classes of the library
         when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
         class method.
...
@@ -75,7 +75,7 @@ class AutoTokenizer(object):
             pretrained_model_name_or_path: either:

                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
             cache_dir: (`optional`) string:
...
@@ -90,7 +90,7 @@ class AutoTokenizer(object):
             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
-            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
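For reference, a minimal usage sketch of the ``AutoTokenizer`` API documented above, after the rename to ``transformers`` (the directory path is the placeholder from the docstring; extra keyword arguments are forwarded to the tokenizer's ``__init__``)::

    from transformers import AutoTokenizer

    # Shortcut name: downloads and caches the vocabulary files.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Local directory previously populated by save_pretrained().
    tokenizer = AutoTokenizer.from_pretrained("./my_model_directory/")

    # Keyword arguments are passed through to the tokenizer's __init__,
    # e.g. to override a special token.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", unk_token="<unk>")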
...
@@ -231,13 +236,13 @@ class PreTrainedTokenizer(object):
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
         r"""
-        Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+        Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.

         Args:
             pretrained_model_name_or_path: either:

                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
             cache_dir: (`optional`) string:
...
@@ -252,7 +257,7 @@ class PreTrainedTokenizer(object):
             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
-            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.

         Examples::
...
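The ``Examples::`` block itself is elided in this excerpt; the call patterns it documents look like the following sketch, using ``BertTokenizer`` as a representative subclass::

    from transformers import BertTokenizer

    # From a shortcut name (downloaded and cached).
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # From a directory saved with save_pretrained().
    tokenizer = BertTokenizer.from_pretrained("./my_model_directory/")

    # From a single vocabulary file (only for tokenizers, such as Bert,
    # that require a single file).
    tokenizer = BertTokenizer.from_pretrained("./my_model_directory/vocab.txt")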
@@ -427,7 +432,7 @@ class PreTrainedTokenizer(object):
         This won't save modifications other than added tokens and the special-token mapping that you may have
         applied to the tokenizer after instantiation (e.g. modifying tokenizer.do_lower_case after creation).

-        This method makes sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+        This method makes sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         if not os.path.isdir(save_directory):
             logger.error("Saving directory ({}) should be a directory".format(save_directory))
...
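A minimal round-trip sketch of the behaviour this docstring describes (the directory name is illustrative; note that ``save_pretrained`` expects an existing directory, hence the ``os.path.isdir`` check above)::

    import os
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.add_tokens(["new_tok_1", "new_tok_2"])  # added tokens are part of the saved state

    os.makedirs("./my_model_directory/", exist_ok=True)  # must exist, or save_pretrained logs an error
    tokenizer.save_pretrained("./my_model_directory/")

    # Restores the vocabulary, added tokens and special-token mapping.
    reloaded = BertTokenizer.from_pretrained("./my_model_directory/")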
@@ -464,7 +469,7 @@ class PreTrainedTokenizer(object):
""" Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
""" Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
and special token mappings.
and special token mappings.
Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
"""
"""
raiseNotImplementedError
raiseNotImplementedError
...
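Since ``save_vocabulary`` only raises ``NotImplementedError`` here, each concrete tokenizer supplies its own implementation. A hypothetical sketch of the expected contract (the ``WhitespaceTokenizer`` class, its ``vocab`` attribute and the ``vocab.txt`` layout are invented for illustration; the method returns a tuple of saved file paths)::

    import os
    from transformers import PreTrainedTokenizer

    class WhitespaceTokenizer(PreTrainedTokenizer):  # hypothetical subclass
        def save_vocabulary(self, save_directory):
            # Write one token per line, ordered by index, and return the
            # tuple of file paths so save_pretrained() can record them.
            vocab_file = os.path.join(save_directory, "vocab.txt")
            with open(vocab_file, "w", encoding="utf-8") as f:
                for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                    f.write(token + "\n")
            return (vocab_file,)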
@@ -518,6 +523,30 @@ class PreTrainedTokenizer(object):
         return len(to_add_tokens)

+    def num_added_tokens(self, pair=False):
+        """
+        Returns the number of added tokens when encoding a sequence with special tokens.
+
+        Note:
+            This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
+            inside your training loop.
+
+        Args:
+            pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
+                number of added tokens in the case of a single sequence if set to False.
+
+        Returns:
+            Number of tokens added to sequences
+        """
+        if pair:
+            initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
+            final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
+        else:
+            initial_tokens_len = len(self.encode("This is a sequence"))
+            final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
+        return final_tokens_len - initial_tokens_len
+
     def add_special_tokens(self, special_tokens_dict):
         """
...
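A quick sketch of what the new ``num_added_tokens`` method reports, assuming a BERT-style tokenizer that wraps a single sequence as ``[CLS] ... [SEP]`` and a pair as ``[CLS] ... [SEP] ... [SEP]``::

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    print(tokenizer.num_added_tokens())           # 2: [CLS] and [SEP]
    print(tokenizer.num_added_tokens(pair=True))  # 3: [CLS] and two [SEP]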
@@ -663,38 +692,185 @@ class PreTrainedTokenizer(object):