Commit 31c23bd5 authored by thomwolf

[BIG] pytorch-transformers => transformers

parent 2f071fcb
@@ -45,7 +45,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 class DistilBertTokenizer(BertTokenizer):
     r"""
     Constructs a DistilBertTokenizer.
-    :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
+    :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
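Note: under the new top-level package name the tokenizer is imported and used exactly as before; a minimal sketch (the shortcut name 'distilbert-base-uncased' is illustrative):

    from transformers import DistilBertTokenizer

    # End-to-end tokenization: punctuation splitting + wordpiece.
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tokens = tokenizer.tokenize("Hello, world!")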
@@ -236,13 +236,13 @@ class PreTrainedTokenizer(object):
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
         r"""
-        Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+        Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
 
         Args:
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
             cache_dir: (`optional`) string:
@@ -257,7 +257,7 @@ class PreTrainedTokenizer(object):
             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
-            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
 
         Examples::
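The diff truncates the example block after ``Examples::``; for reference, a minimal usage sketch consistent with the arguments documented above (the shortcut name and directory path are the illustrative values from the docstring):

    from transformers import BertTokenizer

    # Download and cache the vocabulary for a predefined shortcut name.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Or re-load a tokenizer from a directory previously saved
    # with save_pretrained().
    tokenizer = BertTokenizer.from_pretrained('./my_model_directory/')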
@@ -432,7 +432,7 @@ class PreTrainedTokenizer(object):
 
             This won't save modifications (other than added tokens and special token mapping) you may have
             applied to the tokenizer after instantiation (e.g. modifying tokenizer.do_lower_case after creation).
-            This method makes sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+            This method makes sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         if not os.path.isdir(save_directory):
             logger.error("Saving directory ({}) should be a directory".format(save_directory))
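A save/reload round trip under the renamed package, as the docstring describes (the directory path is illustrative):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.save_pretrained('./my_model_directory/')

    # The full tokenizer state (vocabulary, added tokens, special token
    # mapping) can then be re-loaded from that directory.
    reloaded = BertTokenizer.from_pretrained('./my_model_directory/')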
@@ -469,7 +469,7 @@ class PreTrainedTokenizer(object):
         """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
             and special token mappings.
 
-            Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+            Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         raise NotImplementedError
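This base-class hook is what concrete tokenizers override; a minimal sketch of such an override, assuming a subclass that keeps a ``self.vocab`` token-to-index dict (the class name and file layout here are illustrative, not the actual BertTokenizer code):

    import os

    class MyWordPieceTokenizer(PreTrainedTokenizer):
        def save_vocabulary(self, save_directory):
            # Write one token per line, ordered by index, so the file can be
            # re-loaded by from_pretrained() / the tokenizer __init__.
            vocab_file = os.path.join(save_directory, 'vocab.txt')
            with open(vocab_file, 'w', encoding='utf-8') as writer:
                for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                    writer.write(token + '\n')
            return (vocab_file,)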
@@ -916,7 +916,7 @@ class PreTrainedTokenizer(object):
 
         # To avoid mixing byte-level and unicode for byte-level BPE
         # we need to build the string separately for added tokens and byte-level tokens
-        # cf. https://github.com/huggingface/pytorch-transformers/issues/1133
+        # cf. https://github.com/huggingface/transformers/issues/1133
         sub_texts = []
         current_sub_text = []
         for token in filtered_tokens:
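The loop body is truncated by the diff; a sketch of how this grouping typically completes, assuming the class's ``added_tokens_encoder`` dict and ``convert_tokens_to_string`` method (an assumption from context, not necessarily the exact code). Added tokens flush the pending run and are emitted as-is, while consecutive byte-level tokens are converted together so their byte pieces merge back into valid unicode:

    sub_texts = []
    current_sub_text = []
    for token in filtered_tokens:
        if token in self.added_tokens_encoder:
            if current_sub_text:
                # Convert the pending run of byte-level tokens in one go.
                sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                current_sub_text = []
            sub_texts.append(token)  # added tokens are already plain strings
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_texts.append(self.convert_tokens_to_string(current_sub_text))
    text = ' '.join(sub_texts)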