Unverified commit 2295d783 authored by Sylvain Gugger, committed by GitHub

Copy tokenizer files into each of their repos (#10624)

* Move tokenizer files into each repo

* Fix mBART50 tests

* Fix mBART tests

* Fix Marian tests

* Update templates
parent d26b37e7
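Every hunk below applies the same refactor: the module-level tokenizer constants (VOCAB_FILES_NAMES, PRETRAINED_VOCAB_FILES_MAP, PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES) now point each checkpoint at the tokenizer files hosted in its own model repo, at https://huggingface.co/<model_id>/resolve/main/<file>, instead of reusing one shared URL (or the older cdn.huggingface.co host) for every checkpoint. A minimal sketch of the before/after pattern, reusing the mBART constants that appear in the hunks further down (illustrative only, not new code from this commit):

# Before (removed): every mBART checkpoint borrowed a single URL from the en-ro repo.
_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model"
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}}

# After (added): each checkpoint resolves its own copy of the file from its own repo.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model",
        "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model",
    }
}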
@@ -37,17 +37,21 @@ VOCAB_FILES_NAMES = {
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/vocab.json",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/vocab.json",
},
"spm_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentencepiece.bpe.model",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_config_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/tokenizer_config.json",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/tokenizer_config.json",
},
}
ALL_M2M100_MODELS = ["facebook/m2m100_418M", "facebook/m2m100_1.2B"]
SPM_URL = "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentence.bpe.model"
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/m2m100_418M": 1024,
}
# fmt: off
FAIRSEQ_LANGUAGE_CODES = ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"]
@@ -96,7 +100,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = {m: 1024 for m in ALL_M2M100_MODELS}
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
......
@@ -25,7 +25,7 @@ import sentencepiece
from ...tokenization_utils import PreTrainedTokenizer
vocab_files_names = {
VOCAB_FILES_NAMES = {
"source_spm": "source.spm",
"target_spm": "target.spm",
"vocab": "vocab.json",
@@ -33,11 +33,17 @@ vocab_files_names = {
}
PRETRAINED_VOCAB_FILES_MAP = {
"source_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/source.spm"},
"target_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/target.spm"},
"vocab": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/vocab.json"},
"source_spm": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/source.spm"
},
"target_spm": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/target.spm"
},
"vocab": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/vocab.json"
},
"tokenizer_config_file": {
"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/tokenizer_config.json"
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/tokenizer_config.json"
},
}
@@ -91,7 +97,7 @@ class MarianTokenizer(PreTrainedTokenizer):
>>> outputs = model(**inputs) should work
"""
vocab_files_names = vocab_files_names
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
......
@@ -23,8 +23,20 @@ from ..xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer
logger = logging.get_logger(__name__)
_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model",
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-en-ro": 1024,
"facebook/mbart-large-cc25": 1024,
}
FAIRSEQ_LANGUAGE_CODES = [
"ar_AR",
@@ -78,9 +90,9 @@ class MBartTokenizer(XLMRobertaTokenizer):
>>> inputs["labels"] = labels["input_ids"]
"""
vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
max_model_input_sizes = {m: 1024 for m in _all_mbart_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}}
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
......
@@ -30,8 +30,15 @@ SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
_all_mbart50_models = ["facebook/mbart-large-50-one-to-many-mmt"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model"
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-50-one-to-many-mmt": 1024,
}
# fmt: off
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]
@@ -83,8 +90,8 @@ class MBart50Tokenizer(PreTrainedTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = {m: 1024 for m in _all_mbart50_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart50_models}}
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
......
@@ -36,9 +36,18 @@ logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
_all_mbart50_models = ["facebook/mbart-large-50-one-to-many-mmt"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model"
tokenizer_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/tokenizer.json"
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_file": {
"facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-50-one-to-many-mmt": 1024,
}
# fmt: off
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]
@@ -91,8 +100,8 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = {m: 1024 for m in _all_mbart50_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart50_models}}
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = MBart50Tokenizer
......
@@ -32,9 +32,24 @@ else:
logger = logging.get_logger(__name__)
_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model"
tokenizer_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model",
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_file": {
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json",
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-en-ro": 1024,
"facebook/mbart-large-cc25": 1024,
}
FAIRSEQ_LANGUAGE_CODES = [
"ar_AR",
@@ -89,9 +104,9 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
>>> inputs["labels"] = labels["input_ids"]
"""
vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
max_model_input_sizes = {m: 1024 for m in _all_mbart_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}}
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
slow_tokenizer_class = MBartTokenizer
prefix_tokens: List[int] = []
......
@@ -27,28 +27,17 @@ from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/spiece.model"
}
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/reformer-crime-and-punishment": 524288,
}
......
@@ -32,19 +32,11 @@ else:
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/spiece.model"
@@ -54,9 +46,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
},
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/reformer-crime-and-punishment": 524288,
}
......
@@ -24,7 +24,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt",
}
}
......
@@ -25,10 +25,10 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.jso
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt",
},
"tokenizer_file": {
"yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/tokenizer.json",
},
}
......
@@ -34,16 +34,16 @@ PRETRAINED_VOCAB_FILES_MAP = {
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
},
"merges_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
},
}
......
@@ -32,24 +32,24 @@ PRETRAINED_VOCAB_FILES_MAP = {
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
},
"merges_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
},
"tokenizer_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer.json",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/tokenizer.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/tokenizer.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/tokenizer.json",
},
}
......
@@ -29,16 +29,8 @@ from ...utils import logging
logger = logging.get_logger(__name__)
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
@@ -49,9 +41,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"t5-small": 512,
"t5-base": 512,
......
@@ -32,16 +32,8 @@ else:
logger = logging.get_logger(__name__)
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
@@ -59,9 +51,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
},
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"t5-small": 512,
"t5-base": 512,
......
@@ -37,6 +37,17 @@ VOCAB_FILES_NAMES = {
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json",
},
"tokenizer_config_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer_config.json",
},
}
# Wav2Vec2 has no max input length
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/wav2vec2-base-960h": sys.maxsize}
WAV2VEC2_KWARGS_DOCSTRING = r"""
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
@@ -98,16 +109,8 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = {
"vocab_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
},
"tokenizer_config_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json",
},
}
# Wav2Vec2 has no max input length
max_model_input_sizes = {"facebook/wav2vec2-base-960h": sys.maxsize}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
......
@@ -65,6 +65,8 @@ from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.ca
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
@@ -93,6 +95,7 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
@@ -109,9 +112,16 @@ from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.ca
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {}
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
},
"tokenizer_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"{{cookiecutter.checkpoint_identifier}}": 1024,
......
@@ -62,6 +62,8 @@ from ..bart.tokenization_bart import BartTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
@@ -90,6 +92,7 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer):
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -105,9 +108,13 @@ from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {}
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"{{cookiecutter.checkpoint_identifier}}": 1024,
......
@@ -26,7 +26,7 @@ from transformers.testing_utils import require_sentencepiece
if is_sentencepiece_available():
from transformers.models.marian.tokenization_marian import save_json, vocab_files_names
from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json
from .test_tokenization_common import TokenizerTesterMixin
@@ -50,11 +50,11 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
save_dir = Path(self.tmpdirname)
save_json(vocab_tokens, save_dir / vocab_files_names["vocab"])
save_json(mock_tokenizer_config, save_dir / vocab_files_names["tokenizer_config_file"])
if not (save_dir / vocab_files_names["source_spm"]).exists():
copyfile(SAMPLE_SP, save_dir / vocab_files_names["source_spm"])
copyfile(SAMPLE_SP, save_dir / vocab_files_names["target_spm"])
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"])
save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"])
if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists():
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"])
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"])
tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname)
......
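Not part of the diff, but a quick way to exercise the new maps end to end: loading one of the touched checkpoints by its model id should fetch the tokenizer files from that checkpoint's own repo. A hypothetical smoke test, assuming network access, sentencepiece, and a transformers install that includes this change:

# Hypothetical check, not included in this commit: the files named in VOCAB_FILES_NAMES
# are resolved from the Helsinki-NLP/opus-mt-en-de repo itself.
from transformers import MarianTokenizer

tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
print(tok.vocab_files_names)  # keys: source_spm, target_spm, vocab, ... (see the Marian hunk above)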