Unverified Commit 042a6aa7 authored by Julien Chaumond, committed by GitHub

Tokenizers: ability to load from model subfolder (#8586)



* <small>tiny typo</small>

* Tokenizers: ability to load from model subfolder

* use subfolder for local files as well

* Uniformize model shortcut name => model id

* from s3 => from huggingface.co
Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
parent 48395d6b
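
For context, a minimal sketch of the behavior this PR enables: loading a tokenizer from a subfolder of a model repo on huggingface.co (or of a local directory). The repo id and subfolder name below are illustrative examples (RAG-style repos keep per-component tokenizers in subfolders); they are assumptions, not taken from this diff.

```python
# Hedged sketch of the subfolder loading this PR introduces.
# "facebook/rag-token-nq" and "question_encoder_tokenizer" are illustrative.
from transformers import AutoTokenizer

# Load a tokenizer stored under <repo-root>/question_encoder_tokenizer/
# instead of at the repo root.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/rag-token-nq",
    subfolder="question_encoder_tokenizer",
)

# Per the commit message, the same keyword is meant to work for local files:
# tokenizer = AutoTokenizer.from_pretrained(
#     "./rag-token-nq", subfolder="question_encoder_tokenizer"
# )

print(tokenizer.tokenize("Hello world"))
```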
@@ -3,11 +3,11 @@ Pretrained models
 Here is the full list of the currently provided pretrained models together with a short presentation of each model.
-For a list that includes community-uploaded models, refer to `https://huggingface.co/models
+For a list that includes all community-uploaded models, refer to `https://huggingface.co/models
 <https://huggingface.co/models>`__.
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Architecture       | Shortcut name                                              | Details of the model                                                                                                                  |
+| Architecture       | Model id                                                   | Details of the model                                                                                                                  |
 +====================+============================================================+=======================================================================================================================================+
 | BERT               | ``bert-base-uncased``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                    |                                                            | | Trained on lower-cased English text.                                                                                                |
...
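
The docs hunk above renames "Shortcut name" to "Model id". As a hedged illustration of the terminology, a model id is simply the string passed to `from_pretrained`, which resolves it against huggingface.co:

```python
# "bert-base-uncased" is the model id from the table above.
from transformers import AutoModel, AutoTokenizer

model_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
```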
@@ -57,7 +57,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
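
The `ModelArguments` hunks in this diff all make the same wording change. For readers unfamiliar with these example scripts, here is a hedged sketch of how such a dataclass is typically consumed; the field names are illustrative, not copied from any one script:

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import AutoModel, HfArgumentParser


@dataclass
class ModelArguments:
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model id from huggingface.co/models"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )


parser = HfArgumentParser(ModelArguments)
(model_args,) = parser.parse_args_into_dataclasses()

# cache_dir is simply forwarded to from_pretrained, which caches downloads there.
model = AutoModel.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
```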
@@ -476,7 +476,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--max_seq_length",
...
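
The older scripts use plain `argparse` rather than a dataclass; a hedged sketch of the pattern these `--cache_dir` hunks touch (script-specific arguments omitted, model id illustrative):

```python
import argparse

from transformers import AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cache_dir",
    default="",
    type=str,
    help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
args = parser.parse_args()

# An empty default means "use the default cache", so map "" to None.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir=args.cache_dir if args.cache_dir else None,
)
```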
@@ -298,7 +298,7 @@ def main():
         "--cache_dir",
         default=None,
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
...
@@ -81,7 +81,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
@@ -350,7 +350,7 @@ def main():
         "--cache_dir",
         default=None,
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--max_seq_length",
...
@@ -452,7 +452,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--max_seq_length",
...
@@ -578,7 +578,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
...
@@ -76,7 +76,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
     use_fast_tokenizer: bool = field(
         default=True,
...
@@ -74,7 +74,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
     use_fast_tokenizer: bool = field(
         default=True,
...
@@ -76,7 +76,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
     use_fast_tokenizer: bool = field(
         default=True,
...
@@ -64,7 +64,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
     use_fast_tokenizer: bool = field(
         default=True,
...
@@ -236,7 +236,7 @@ class BaseTransformer(pl.LightningModule):
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--encoder_layerdrop",
...
@@ -620,7 +620,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--max_seq_length",
...
@@ -725,7 +725,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
...
@@ -61,7 +61,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
@@ -65,7 +65,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
@@ -532,7 +532,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
...
@@ -51,7 +51,8 @@ class ModelArguments:
     # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
     # or just modify its tokenizer_config.json.
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
@@ -63,7 +63,8 @@ class ModelArguments:
     # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
     # or just modify its tokenizer_config.json.
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...