Unverified Commit 042a6aa7 authored by Julien Chaumond, committed by GitHub

Tokenizers: ability to load from model subfolder (#8586)



* <small>tiny typo</small>

* Tokenizers: ability to load from model subfolder

* use subfolder for local files as well

* Uniformize model shortcut name => model id

* from s3 => from huggingface.co
Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
parent 48395d6b
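
For context, a minimal sketch of the behavior this PR enables: loading a tokenizer from a subfolder of a model repo on huggingface.co (or of a local directory). The repo id and subfolder name below are illustrative examples (RAG-style repos keep per-component tokenizers in subfolders); they are assumptions, not taken from this diff.

```python
# Hedged sketch of the subfolder loading this PR introduces.
# "facebook/rag-token-nq" and "question_encoder_tokenizer" are illustrative.
from transformers import AutoTokenizer

# Load a tokenizer stored under <repo-root>/question_encoder_tokenizer/
# instead of at the repo root.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/rag-token-nq",
    subfolder="question_encoder_tokenizer",
)

# Per the commit message, the same keyword is meant to work for local files:
# tokenizer = AutoTokenizer.from_pretrained(
#     "./rag-token-nq", subfolder="question_encoder_tokenizer"
# )

print(tokenizer.tokenize("Hello world"))
```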
@@ -3,11 +3,11 @@ Pretrained models
 Here is the full list of the currently provided pretrained models together with a short presentation of each model.
-For a list that includes community-uploaded models, refer to `https://huggingface.co/models
+For a list that includes all community-uploaded models, refer to `https://huggingface.co/models
 <https://huggingface.co/models>`__.
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Architecture       | Shortcut name                                              | Details of the model                                                                                                                  |
+| Architecture       | Model id                                                   | Details of the model                                                                                                                  |
 +====================+============================================================+=======================================================================================================================================+
 | BERT               | ``bert-base-uncased``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                    |                                                            | | Trained on lower-cased English text.                                                                                                |
...
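
The docs hunk above renames "Shortcut name" to "Model id". As a hedged illustration of the terminology, a model id is simply the string passed to `from_pretrained`, which resolves it against huggingface.co:

```python
# "bert-base-uncased" is the model id from the table above.
from transformers import AutoModel, AutoTokenizer

model_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
```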
@@ -57,7 +57,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
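
The `ModelArguments` hunks in this diff all make the same wording change. For readers unfamiliar with these example scripts, here is a hedged sketch of how such a dataclass is typically consumed; the field names are illustrative, not copied from any one script:

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import AutoModel, HfArgumentParser


@dataclass
class ModelArguments:
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model id from huggingface.co/models"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )


parser = HfArgumentParser(ModelArguments)
(model_args,) = parser.parse_args_into_dataclasses()

# cache_dir is simply forwarded to from_pretrained, which caches downloads there.
model = AutoModel.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
```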
@@ -476,7 +476,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--max_seq_length",
...
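
The older scripts use plain `argparse` rather than a dataclass; a hedged sketch of the pattern these `--cache_dir` hunks touch (script-specific arguments omitted, model id illustrative):

```python
import argparse

from transformers import AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cache_dir",
    default="",
    type=str,
    help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
args = parser.parse_args()

# An empty default means "use the default cache", so map "" to None.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir=args.cache_dir if args.cache_dir else None,
)
```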
@@ -298,7 +298,7 @@ def main():
         "--cache_dir",
         default=None,
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
...
@@ -81,7 +81,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
@@ -350,7 +350,7 @@ def main():
         "--cache_dir",
         default=None,
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--max_seq_length",
...
@@ -452,7 +452,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--max_seq_length",
...
@@ -578,7 +578,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
...
@@ -76,7 +76,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
     use_fast_tokenizer: bool = field(
         default=True,
...
@@ -74,7 +74,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
     use_fast_tokenizer: bool = field(
         default=True,
...
@@ -76,7 +76,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
     use_fast_tokenizer: bool = field(
         default=True,
...
@@ -64,7 +64,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
     use_fast_tokenizer: bool = field(
         default=True,
...
@@ -236,7 +236,7 @@ class BaseTransformer(pl.LightningModule):
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--encoder_layerdrop",
...
@@ -620,7 +620,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
         "--max_seq_length",
...
@@ -725,7 +725,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
...
@@ -61,7 +61,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
@@ -65,7 +65,8 @@ class ModelArguments:
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
@@ -532,7 +532,7 @@ def main():
         "--cache_dir",
         default="",
         type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
     )
     parser.add_argument(
...
@@ -51,7 +51,8 @@ class ModelArguments:
     # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
     # or just modify its tokenizer_config.json.
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...
@@ -63,7 +63,8 @@ class ModelArguments:
     # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
     # or just modify its tokenizer_config.json.
     cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
     )
...