@@ -236,13 +236,13 @@ class PreTrainedTokenizer(object):
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
r"""
- Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+ Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
Args:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+ - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
cache_dir: (`optional`) string:
...
...
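As an illustration of the three accepted forms of ``pretrained_model_name_or_path`` (the local paths below are placeholders taken from the docstring, not real files):

    from transformers import BertTokenizer

    # 1. shortcut name of a predefined tokenizer (downloaded to / read from the cache)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # 2. directory previously written by save_pretrained()
    tokenizer = BertTokenizer.from_pretrained('./my_model_directory/')

    # 3. single vocabulary file, only for tokenizers that need exactly one file (e.g. Bert)
    tokenizer = BertTokenizer.from_pretrained('./my_model_directory/vocab.txt')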
@@ -257,7 +257,7 @@ class PreTrainedTokenizer(object):
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
- kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
+ kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
Examples::
...
...
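A small sketch of the ``kwargs`` behaviour described above (the replacement token ``'<unk_new>'`` is an arbitrary illustration and should already be in the vocabulary, or be added with ``add_special_tokens``):

    from transformers import BertTokenizer

    # Keyword arguments are forwarded to the tokenizer __init__, so special
    # tokens can be remapped at load time.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk_new>')
    assert tokenizer.unk_token == '<unk_new>'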
@@ -432,7 +432,7 @@ class PreTrainedTokenizer(object):
This won't save modifications you may have applied to the tokenizer after instantiation (e.g. modifying tokenizer.do_lower_case after creation), other than added tokens and the special tokens mapping.
- This method makes sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+ This method makes sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
"""
if not os.path.isdir(save_directory):
logger.error("Saving directory ({}) should be a directory".format(save_directory))
...
...
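A short round-trip sketch of the behaviour documented here (``./my_model_directory/`` is a placeholder; ``save_pretrained`` expects an existing directory, as the ``os.path.isdir`` check above shows):

    import os
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.add_tokens(['new_tok1', 'new_tok2'])            # post-instantiation additions

    os.makedirs('./my_model_directory/', exist_ok=True)
    tokenizer.save_pretrained('./my_model_directory/')        # vocabulary + added/special tokens
    reloaded = BertTokenizer.from_pretrained('./my_model_directory/')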
@@ -469,7 +469,7 @@ class PreTrainedTokenizer(object):
""" Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
and special token mappings.
- Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+ Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
"""
raise NotImplementedError
...
...
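Since ``save_vocabulary`` is only a stub in the base class, each concrete tokenizer overrides it. A hypothetical minimal override for a single-file vocabulary (``MyTokenizer``, ``VOCAB_NAME`` and ``self.vocab`` are illustrative names, not from the library):

    import os

    VOCAB_NAME = 'vocab.txt'  # illustrative file name

    class MyTokenizer(PreTrainedTokenizer):
        def save_vocabulary(self, save_directory):
            """Write one token per line, ordered by index, and return the file path(s)."""
            vocab_file = os.path.join(save_directory, VOCAB_NAME)
            with open(vocab_file, 'w', encoding='utf-8') as writer:
                for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                    writer.write(token + '\n')
            # from_pretrained() looks for the files returned here
            return (vocab_file,)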
@@ -916,7 +916,7 @@ class PreTrainedTokenizer(object):
# To avoid mixing byte-level and unicode for byte-level BPE
# we need to build the string separately for added tokens and byte-level tokens
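The comment above refers to how the decoded string is assembled: added tokens never went through the byte-level BPE encoding, so they must not be run through ``convert_tokens_to_string``. A rough standalone sketch of that split (assuming ``filtered_tokens``, ``self.added_tokens_encoder`` and ``self.convert_tokens_to_string`` as in the surrounding method; not the library's exact code):

    sub_texts, current_sub_text = [], []
    for token in filtered_tokens:
        if token in self.added_tokens_encoder:
            # flush the pending byte-level run, then keep the added token as plain text
            if current_sub_text:
                sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_texts.append(self.convert_tokens_to_string(current_sub_text))
    text = ' '.join(sub_texts)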