Unverified Commit 2295d783 authored by Sylvain Gugger, committed by GitHub

Copy tokenizer files in each of their repo (#10624)

* Move tokenizer files in each repo

* Fix mBART50 tests

* Fix mBART tests

* Fix Marian tests

* Update templates
parent d26b37e7
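
Every hunk below follows the same pattern: module-level URL constants that borrowed files from another checkpoint's repo (often built with dict comprehensions over a model list) are replaced by explicit per-checkpoint maps pointing at each model's own repo. A minimal sketch of the before/after shapes, using the BART names from the first hunk (the two-model list is truncated for brevity, and the comprehension in the "after" part is only a compact stand-in for the literal URLs the diff spells out):

```python
# Before: all checkpoints borrowed the files hosted in the roberta-large repo.
vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
_all_bart_models = ["facebook/bart-base", "facebook/bart-large"]  # truncated
OLD_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {m: vocab_url for m in _all_bart_models},
}

# After: each checkpoint resolves the copy now hosted in its own repo.
NEW_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        m: f"https://huggingface.co/{m}/resolve/main/vocab.json"
        for m in _all_bart_models
    },
}
```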
src/transformers/models/bart/tokenization_bart.py
@@ -20,18 +20,36 @@ from ..roberta.tokenization_roberta import RobertaTokenizer
 logger = logging.get_logger(__name__)
 
-# vocab and merges same as roberta
-vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
-merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
-_all_bart_models = [
-    "facebook/bart-base",
-    "facebook/bart-large",
-    "facebook/bart-large-mnli",
-    "facebook/bart-large-cnn",
-    "facebook/bart-large-xsum",
-    "yjernite/bart_eli5",
-    # This is not exhaustive: see https://huggingface.co/models?filter=bart
-]
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+
+# See all BART models at https://huggingface.co/models?filter=bart
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json",
+        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
+        "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json",
+        "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json",
+        "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json",
+        "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json",
+    },
+    "merges_file": {
+        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt",
+        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt",
+        "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt",
+        "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt",
+        "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt",
+        "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "facebook/bart-base": 1024,
+    "facebook/bart-large": 1024,
+    "facebook/bart-large-mnli": 1024,
+    "facebook/bart-large-cnn": 1024,
+    "facebook/bart-large-xsum": 1024,
+    "yjernite/bart_eli5": 1024,
+}
 
 class BartTokenizer(RobertaTokenizer):
@@ -42,9 +60,6 @@ class BartTokenizer(RobertaTokenizer):
     :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
     parameters and other methods.
     """
-    # merges and vocab same as Roberta
-    max_model_input_sizes = {m: 1024 for m in _all_bart_models}
-    pretrained_vocab_files_map = {
-        "vocab_file": {m: vocab_url for m in _all_bart_models},
-        "merges_file": {m: merges_url for m in _all_bart_models},
-    }
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
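
For users nothing changes in the API; only the resolved URLs move. A quick check, assuming network access to the Hub:

```python
from transformers import BartTokenizer

# After this commit the files resolve inside the checkpoint's own repo,
# e.g. https://huggingface.co/facebook/bart-base/resolve/main/vocab.json,
# instead of the roberta-large copies.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
print(tokenizer.max_model_input_sizes["facebook/bart-base"])  # 1024
```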
src/transformers/models/bart/tokenization_bart_fast.py
@@ -21,19 +21,44 @@ from .tokenization_bart import BartTokenizer
 logger = logging.get_logger(__name__)
 
-# vocab and merges same as roberta
-vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
-merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
-tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json"
-_all_bart_models = [
-    "facebook/bart-base",
-    "facebook/bart-large",
-    "facebook/bart-large-mnli",
-    "facebook/bart-large-cnn",
-    "facebook/bart-large-xsum",
-    "yjernite/bart_eli5",
-    # This is not exhaustive: see https://huggingface.co/models?filter=bart
-]
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+# See all BART models at https://huggingface.co/models?filter=bart
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json",
+        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
+        "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json",
+        "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json",
+        "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json",
+        "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json",
+    },
+    "merges_file": {
+        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt",
+        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt",
+        "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt",
+        "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt",
+        "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt",
+        "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt",
+    },
+    "tokenizer_file": {
+        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/tokenizer.json",
+        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json",
+        "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/tokenizer.json",
+        "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/tokenizer.json",
+        "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/tokenizer.json",
+        "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/tokenizer.json",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "facebook/bart-base": 1024,
+    "facebook/bart-large": 1024,
+    "facebook/bart-large-mnli": 1024,
+    "facebook/bart-large-cnn": 1024,
+    "facebook/bart-large-xsum": 1024,
+    "yjernite/bart_eli5": 1024,
+}
 
 class BartTokenizerFast(RobertaTokenizerFast):
@@ -44,11 +69,7 @@ class BartTokenizerFast(RobertaTokenizerFast):
     superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
     initialization parameters and other methods.
     """
-    # merges and vocab same as Roberta
-    max_model_input_sizes = {m: 1024 for m in _all_bart_models}
-    pretrained_vocab_files_map = {
-        "vocab_file": {m: vocab_url for m in _all_bart_models},
-        "merges_file": {m: merges_url for m in _all_bart_models},
-        "tokenizer_file": {m: tokenizer_url for m in _all_bart_models},
-    }
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     slow_tokenizer_class = BartTokenizer
src/transformers/models/bert_generation/tokenization_bert_generation.py
@@ -29,7 +29,13 @@ logger = logging.get_logger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
 
-tokenizer_url = "https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model"
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "bert_for_seq_generation": "https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"bert_for_seq_generation": 512}
 
 class BertGenerationTokenizer(PreTrainedTokenizer):
@@ -55,8 +61,8 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = {"vocab_file": {"bert_for_seq_generation": tokenizer_url}}
-    max_model_input_sizes = {"bert_for_seq_generation": 512}
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     prefix_tokens: List[int] = []
     model_input_names = ["input_ids", "attention_mask"]
src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -29,9 +29,18 @@ logger = logging.get_logger(__name__)
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
-    # "tokenizer_config_file": "tokenizer_config.json",
+    "tokenizer_config_file": "tokenizer_config.json",
 }
-CKPT_3B = "facebook/blenderbot-3B"
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
+    "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
+    "tokenizer_config_file": {
+        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
 
 class BlenderbotTokenizer(RobertaTokenizer):
@@ -45,19 +54,9 @@ class BlenderbotTokenizer(RobertaTokenizer):
     Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
     parameters.
     """
-    vocab_files_names = {
-        "vocab_file": "vocab.json",
-        "merges_file": "merges.txt",
-        "tokenizer_config_file": "tokenizer_config.json",
-    }
-    pretrained_vocab_files_map = {
-        "vocab_file": {CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
-        "merges_file": {CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
-        "tokenizer_config_file": {
-            CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
-        },
-    }
-    max_model_input_sizes = {"facebook/blenderbot-3B": 128}
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None):
         """
src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
@@ -33,6 +33,20 @@ VOCAB_FILES_NAMES = {
     "tokenizer_config_file": "tokenizer_config.json",
 }
 
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
+    },
+    "merges_file": {
+        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
+    },
+    "tokenizer_config_file": {
+        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot_small-90M": 512}
 
 def get_pairs(word):
     """
@@ -75,23 +89,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
             Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
     """
 
-    vocab_files_names = {
-        "vocab_file": "vocab.json",
-        "merges_file": "merges.txt",
-        "tokenizer_config": "tokenizer_config.json",
-    }
-    pretrained_vocab_files_map = {
-        "vocab_file": {
-            "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
-        },
-        "merges_file": {
-            "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
-        },
-        "tokenizer_config_file": {
-            "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer.json"
-        },
-    }
-    max_model_input_sizes = {"facebook/blenderbot_small-90M": 512}
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
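
Besides moving URLs, this hunk repairs two inconsistencies in BlenderbotSmallTokenizer: the old class-level dict used the key "tokenizer_config" where the download map used "tokenizer_config_file", and the old "tokenizer_config_file" URL pointed at tokenizer.json. A hypothetical sanity check (not part of the commit) that would have caught the key mismatch:

```python
from transformers import BlenderbotSmallTokenizer

def check_tokenizer_maps(tok_cls):
    # Every key in pretrained_vocab_files_map should name a file declared in
    # vocab_files_names, otherwise the downloaded file is never picked up.
    file_keys = set(tok_cls.vocab_files_names)
    map_keys = set(tok_cls.pretrained_vocab_files_map)
    assert map_keys <= file_keys, f"dangling map keys: {map_keys - file_keys}"

check_tokenizer_maps(BlenderbotSmallTokenizer)  # passes after this commit
```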
src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
@@ -24,9 +24,23 @@ from .tokenization_blenderbot_small import BlenderbotSmallTokenizer
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {}
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+    "tokenizer_config_file": "tokenizer_config.json",
+}
 
-PRETRAINED_VOCAB_FILES_MAP = {}
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
+    },
+    "merges_file": {
+        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
+    },
+    "tokenizer_config_file": {
+        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
+    },
+}
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "facebook/blenderbot_small-90M": 512,
src/transformers/models/camembert/tokenization_camembert.py
@@ -39,13 +39,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "camembert-base": 512,
 }
 
-SHARED_MODEL_IDENTIFIERS = [
-    # Load with
-    # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
-    "Musixmatch/umberto-commoncrawl-cased-v1",
-    "Musixmatch/umberto-wikipedia-uncased-v1",
-]
-
 SPIECE_UNDERLINE = "▁"
src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -48,13 +48,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "camembert-base": 512,
 }
 
-SHARED_MODEL_IDENTIFIERS = [
-    # Load with
-    # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
-    "Musixmatch/umberto-commoncrawl-cased-v1",
-    "Musixmatch/umberto-wikipedia-uncased-v1",
-]
-
 SPIECE_UNDERLINE = "▁"
src/transformers/models/distilbert/tokenization_distilbert.py
@@ -24,12 +24,12 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
-        "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
-        "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
+        "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
+        "distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt",
+        "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
+        "distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt",
         "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
-        "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
+        "distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt",
     }
 }
src/transformers/models/distilbert/tokenization_distilbert_fast.py
@@ -25,20 +25,20 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
-        "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
-        "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
+        "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
+        "distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt",
+        "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
+        "distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt",
         "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
-        "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
+        "distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json",
-        "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json",
-        "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/tokenizer.json",
+        "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json",
+        "distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/tokenizer.json",
+        "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json",
+        "distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/tokenizer.json",
         "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/tokenizer.json",
-        "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json",
+        "distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer.json",
     },
 }
src/transformers/models/dpr/tokenization_dpr.py
@@ -30,32 +30,32 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
+        "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/vocab.txt",
+        "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
+        "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/tokenizer.json",
+        "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/tokenizer.json",
     },
 }
 QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
+        "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/vocab.txt",
+        "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
+        "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/tokenizer.json",
+        "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/tokenizer.json",
     },
 }
 READER_PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
+        "facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/vocab.txt",
+        "facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
+        "facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/tokenizer.json",
+        "facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/tokenizer.json",
     },
 }
src/transformers/models/dpr/tokenization_dpr_fast.py
@@ -31,32 +31,32 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
+        "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/vocab.txt",
+        "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
+        "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/tokenizer.json",
+        "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/tokenizer.json",
     },
 }
 QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
+        "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/vocab.txt",
+        "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
+        "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/tokenizer.json",
+        "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/tokenizer.json",
     },
 }
 READER_PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
+        "facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/vocab.txt",
+        "facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
+        "facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/tokenizer.json",
+        "facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/tokenizer.json",
     },
 }
src/transformers/models/fsmt/tokenization_fsmt.py
@@ -36,9 +36,13 @@ VOCAB_FILES_NAMES = {
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    "src_vocab_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/vocab-src.json"},
-    "tgt_vocab_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/vocab-tgt.json"},
-    "merges_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/merges.txt"},
+    "src_vocab_file": {
+        "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-src.json"
+    },
+    "tgt_vocab_file": {
+        "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-tgt.json"
+    },
+    "merges_file": {"stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/merges.txt"},
 }
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"stas/tiny-wmt19-en-de": 1024}
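
The FSMT hunk also retires the legacy cdn.huggingface.co links in favor of the Hub's resolve endpoint. The URL scheme every added URL follows, shown as a hypothetical helper (`hub_resolve_url` is not a transformers function):

```python
def hub_resolve_url(model_id: str, filename: str, revision: str = "main") -> str:
    # Pattern used by the URLs added in this commit; the revision is pinned
    # to "main" throughout.
    return f"https://huggingface.co/{model_id}/resolve/{revision}/{filename}"

assert hub_resolve_url("stas/tiny-wmt19-en-de", "merges.txt") == (
    "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/merges.txt"
)
```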
src/transformers/models/herbert/tokenization_herbert_fast.py
@@ -17,12 +17,7 @@ from typing import List, Optional, Tuple
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
-from .tokenization_herbert import (
-    PRETRAINED_INIT_CONFIGURATION,
-    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
-    PRETRAINED_VOCAB_FILES_MAP,
-    HerbertTokenizer,
-)
+from .tokenization_herbert import HerbertTokenizer
 
 logger = logging.get_logger(__name__)
@@ -32,6 +27,14 @@ VOCAB_FILES_NAMES = {
     "merges_file": "merges.txt",
 }
 
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"},
+    "merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"},
+}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
+PRETRAINED_INIT_CONFIGURATION = {}
+
 class HerbertTokenizerFast(PreTrainedTokenizerFast):
     """
src/transformers/models/layoutlm/tokenization_layoutlm.py
@@ -25,8 +25,8 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
+        "microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt",
+        "microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt",
     }
 }
src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
@@ -26,12 +26,12 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
+        "microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt",
+        "microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json",
+        "microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/tokenizer.json",
+        "microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/tokenizer.json",
     },
 }
src/transformers/models/longformer/tokenization_longformer.py
@@ -20,17 +20,24 @@ from ..roberta.tokenization_roberta import RobertaTokenizer
 logger = logging.get_logger(__name__)
 
-# vocab and merges same as roberta
-vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
-merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
-_all_longformer_models = [
-    "allenai/longformer-base-4096",
-    "allenai/longformer-large-4096",
-    "allenai/longformer-large-4096-finetuned-triviaqa",
-    "allenai/longformer-base-4096-extra.pos.embd.only",
-    "allenai/longformer-large-4096-extra.pos.embd.only",
-]
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merge_file": "merges.txt"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/vocab.json",
+        "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/vocab.json",
+        "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/vocab.json",
+        "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/vocab.json",
+        "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/vocab.json",
+    },
+    "merge_file": {
+        "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/merges.txt",
+        "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/merges.txt",
+        "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/merges.txt",
+        "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/merges.txt",
+        "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/merges.txt",
+    },
+}
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "allenai/longformer-base-4096": 4096,
@@ -48,9 +55,6 @@ class LongformerTokenizer(RobertaTokenizer):
     :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the
     superclass for usage examples and documentation concerning parameters.
     """
-    # merges and vocab same as Roberta
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    pretrained_vocab_files_map = {
-        "vocab_file": {m: vocab_url for m in _all_longformer_models},
-        "merges_file": {m: merges_url for m in _all_longformer_models},
-    }
src/transformers/models/longformer/tokenization_longformer_fast.py
@@ -21,18 +21,31 @@ from .tokenization_longformer import LongformerTokenizer
 logger = logging.get_logger(__name__)
 
-# vocab and merges same as roberta
-vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
-merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
-tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json"
-_all_longformer_models = [
-    "allenai/longformer-base-4096",
-    "allenai/longformer-large-4096",
-    "allenai/longformer-large-4096-finetuned-triviaqa",
-    "allenai/longformer-base-4096-extra.pos.embd.only",
-    "allenai/longformer-large-4096-extra.pos.embd.only",
-]
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merge_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/vocab.json",
+        "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/vocab.json",
+        "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/vocab.json",
+        "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/vocab.json",
+        "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/vocab.json",
+    },
+    "merge_file": {
+        "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/merges.txt",
+        "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/merges.txt",
+        "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/merges.txt",
+        "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/merges.txt",
+        "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/merges.txt",
+    },
+    "tokenizer_file": {
+        "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/tokenizer.json",
+        "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/tokenizer.json",
+        "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/tokenizer.json",
+        "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/tokenizer.json",
+        "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/tokenizer.json",
    },
+}
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "allenai/longformer-base-4096": 4096,
@@ -51,10 +64,7 @@ class LongformerTokenizerFast(RobertaTokenizerFast):
     to the superclass for usage examples and documentation concerning parameters.
     """
     # merges and vocab same as Roberta
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    pretrained_vocab_files_map = {
-        "vocab_file": {m: vocab_url for m in _all_longformer_models},
-        "merges_file": {m: merges_url for m in _all_longformer_models},
-        "tokenizer_file": {m: tokenizer_url for m in _all_longformer_models},
-    }
     slow_tokenizer_class = LongformerTokenizer
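
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES feeds the `max_model_input_sizes` class attribute, which `from_pretrained` uses to cap `model_max_length` for known checkpoints. A quick illustration, assuming Hub access:

```python
from transformers import LongformerTokenizerFast

tok = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
# model_max_length is taken from max_model_input_sizes for known checkpoints.
print(tok.model_max_length)  # expected: 4096
```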
src/transformers/models/lxmert/tokenization_lxmert.py
@@ -16,33 +16,18 @@
 from ..bert.tokenization_bert import BertTokenizer
 
-####################################################
-# Mapping from the keyword arguments names of Tokenizer `__init__`
-# to file names for serializing Tokenizer instances
-####################################################
 VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 
-####################################################
-# Mapping from the keyword arguments names of Tokenizer `__init__`
-# to pretrained vocabulary URL for all the model ids.
-####################################################
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
+        "unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt",
     }
 }
 
-####################################################
-# Mapping from model ids to max length of inputs
-####################################################
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "unc-nlp/lxmert-base-uncased": 512,
 }
 
-####################################################
-# Mapping from model ids to a dictionary of additional
-# keyword arguments for Tokenizer `__init__`.
-# To be used for checkpoint specific configurations.
-####################################################
 PRETRAINED_INIT_CONFIGURATION = {
     "unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
 }
src/transformers/models/lxmert/tokenization_lxmert_fast.py
@@ -17,36 +17,21 @@ from ..bert.tokenization_bert_fast import BertTokenizerFast
 from .tokenization_lxmert import LxmertTokenizer
 
-####################################################
-# Mapping from the keyword arguments names of Tokenizer `__init__`
-# to file names for serializing Tokenizer instances
-####################################################
 VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 
-####################################################
-# Mapping from the keyword arguments names of Tokenizer `__init__`
-# to pretrained vocabulary URL for all the model ids.
-####################################################
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
+        "unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt",
     },
     "tokenizer_file": {
-        "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
+        "unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/tokenizer.json",
     },
 }
 
-####################################################
-# Mapping from model ids to max length of inputs
-####################################################
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "unc-nlp/lxmert-base-uncased": 512,
 }
 
-####################################################
-# Mapping from model ids to a dictionary of additional
-# keyword arguments for Tokenizer `__init__`.
-# To be used for checkpoint specific configurations.
-####################################################
 PRETRAINED_INIT_CONFIGURATION = {
     "unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
 }
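
The lxmert hunks keep PRETRAINED_INIT_CONFIGURATION, which supplies checkpoint-specific defaults for the tokenizer's `__init__` (here, `do_lower_case=True`). An illustrative check, assuming Hub access and that the fast BERT-style tokenizer exposes the attribute:

```python
from transformers import LxmertTokenizerFast

tok = LxmertTokenizerFast.from_pretrained("unc-nlp/lxmert-base-uncased")
# The init configuration forces lowercasing for this checkpoint.
print(tok.do_lower_case)  # expected: True
```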