Unverified Commit ba8c4d0a authored by Thomas Wolf, committed by GitHub

[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP]

* [WIP] splitting sentencepiece and tokenizers dependencies

* update dummy objects

* add name_or_path to models and tokenizers

* prefix added to file names

* prefix

* styling + quality

* splitting all the tokenizer files - sorting sentencepiece-based ones

* update tokenizers version up to 0.9.0

* remove hard dependency on sentencepiece 🎉

* and removed hard dependency on tokenizers 🎉



* update conversion script

* update missing models

* fixing tests

* move test_tokenization_fast to main tokenization tests - fix bugs

* bump up tokenizers

* fix bert_generation

* update and fix several tokenizers

* keep sentencepiece in deps for now

* fix funnel and deberta tests

* fix fsmt

* fix marian tests

* fix layoutlm

* fix squeezebert and gpt2

* fix T5 tokenization

* fix xlnet tests

* style

* fix mbart

* bump up tokenizers to 0.9.2

* fix model tests

* fix tf models

* fix seq2seq examples

* fix tests without sentencepiece

* fix slow => fast conversion without sentencepiece

* update auto and bert generation tests

* fix mbart tests

* fix auto and common test without tokenizers

* fix tests without tokenizers

* clean up and lighten tests when tokenizers + sentencepiece are both off

* style, quality and test fixes

* add sentencepiece to doc/examples reqs

* leave sentencepiece on for now

* style and quality, split herbert, and fix pegasus

* WIP Herbert fast

* add sample_text_no_unicode and fix herbert tokenization

* skip FSMT example test for now

* fix style

* fix fsmt in example tests

* update following Lysandre and Sylvain's comments

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent c65863ce
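
The thrust of this PR is that both tokenizers and sentencepiece become optional extras. Below is a minimal, hedged sketch of how downstream code might guard on them; it assumes the availability helpers is_tokenizers_available and is_sentencepiece_available (referenced by the dummy-object commits above) live in transformers.file_utils.

from transformers.file_utils import is_sentencepiece_available, is_tokenizers_available

if is_tokenizers_available():
    # Rust-backed fast tokenizer: only needs the `tokenizers` package.
    from transformers import GPT2TokenizerFast as Gpt2Tok
else:
    # Pure-Python slow tokenizer: no extra dependency needed for GPT-2.
    from transformers import GPT2Tokenizer as Gpt2Tok

tokenizer = Gpt2Tok.from_pretrained("gpt2")

if not is_sentencepiece_available():
    # Slow SentencePiece-based tokenizers (T5, XLNet, Marian, ...) are unavailable;
    # their fast counterparts still work through `tokenizers` alone.
    print("sentencepiece is not installed: slow SentencePiece tokenizers cannot be used")
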
@@ -19,12 +19,11 @@ import json
import os
import warnings
from functools import lru_cache
from typing import Optional, Tuple
import regex as re
from .tokenization_utils import AddedToken, PreTrainedTokenizer
from .tokenization_utils_base import BatchEncoding
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
@@ -256,22 +255,16 @@ class GPT2Tokenizer(PreTrainedTokenizer):
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -303,114 +296,3 @@ class GPT2Tokenizer(PreTrainedTokenizer):
if is_split_into_words or add_prefix_space:
text = " " + text
return (text, kwargs)
class GPT2TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
::
>>> from transformers import GPT2TokenizerFast
>>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
>>> tokenizer("Hello world")['input_ids']
[15496, 995]
>>> tokenizer(" Hello world")['input_ids']
[18435, 995]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
``add_prefix_space=True``.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The GPT-2 tokenizer detects the beginning of words by the preceding space.)
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = GPT2Tokenizer
def __init__(
self,
vocab_file,
merges_file,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs
):
super().__init__(
vocab_file,
merges_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
self.add_prefix_space = add_prefix_space
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
else:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
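
A minimal usage sketch of the behaviour described by the GPT2TokenizerFast docstring and asserts above; the "gpt2" checkpoint name and the commented ids follow the doctest above, and the exact values depend on the downloaded vocabulary.

from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
# A leading space changes how the first word is encoded (see the doctest above).
print(tok("Hello world")["input_ids"])    # e.g. [15496, 995]
print(tok(" Hello world")["input_ids"])   # e.g. [18435, 995]

# Pretokenized input must be paired with add_prefix_space=True,
# otherwise the assert in _batch_encode_plus above is triggered.
tok_ws = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)
print(tok_ws(["Hello", "world"], is_split_into_words=True)["input_ids"])
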
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
import json
import warnings
from typing import Optional, Tuple
from tokenizers import pre_tokenizers
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_utils_base import BatchEncoding
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
},
"merges_file": {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
},
"tokenizer_file": {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tokenizer.json",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tokenizer.json",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tokenizer.json",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-tokenizer.json",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"gpt2": 1024,
"gpt2-medium": 1024,
"gpt2-large": 1024,
"gpt2-xl": 1024,
"distilgpt2": 1024,
}
class GPT2TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
::
>>> from transformers import GPT2TokenizerFast
>>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
>>> tokenizer("Hello world")['input_ids']
[15496, 995]
>>> tokenizer(" Hello world")['input_ids']
[18435, 995]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
``add_prefix_space=True``.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The GPT-2 tokenizer detects the beginning of words by the preceding space.)
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = GPT2Tokenizer
def __init__(
self,
vocab_file,
merges_file,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs
):
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
self.add_prefix_space = add_prefix_space
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
else:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
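
The change repeated across the files in this diff is the new filename_prefix argument of save_vocabulary. A hedged sketch of what it yields; the exact file names and the returned tuple depend on the tokenizer class.

import tempfile
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
with tempfile.TemporaryDirectory() as tmp:
    # Without a prefix the files are vocab.json / merges.txt;
    # with one they become "<prefix>-vocab.json" / "<prefix>-merges.txt".
    files = tok.save_vocabulary(tmp, filename_prefix="run1")
    print(files)  # e.g. ('<tmp>/run1-vocab.json', '<tmp>/run1-merges.txt')
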
@@ -13,10 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional
from .tokenization_bert import BasicTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_xlm import XLMTokenizer
from .utils import logging
@@ -28,6 +25,14 @@ VOCAB_FILES_NAMES = {
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"},
"merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}
class HerbertTokenizer(XLMTokenizer):
"""
@@ -45,6 +50,9 @@ class HerbertTokenizer(XLMTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, **kwargs):
@@ -71,127 +79,3 @@ class HerbertTokenizer(XLMTokenizer):
split_tokens.extend([t for t in self.bpe(token).split(" ")])
return split_tokens
class HerbertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library).
Peculiarities:
- uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation.
Each occurrence of a punctuation character will be treated separately.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
"""
vocab_files_names = VOCAB_FILES_NAMES
slow_tokenizer_class = HerbertTokenizer
def __init__(self, vocab_file, merges_file, **kwargs):
kwargs["cls_token"] = "<s>"
kwargs["unk_token"] = "<unk>"
kwargs["pad_token"] = "<pad>"
kwargs["mask_token"] = "<mask>"
kwargs["sep_token"] = "</s>"
super().__init__(
vocab_file,
merges_file,
**kwargs,
)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A HerBERT sequence, like a BERT sequence, has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
cls = [self.cls_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A HerBERT sequence pair mask, like a BERT one, has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# coding=utf-8
# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple
from .tokenization_herbert import (
PRETRAINED_INIT_CONFIGURATION,
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
PRETRAINED_VOCAB_FILES_MAP,
HerbertTokenizer,
)
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
class HerbertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library).
Peculiarities:
- uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation.
Each occurrence of a punctuation character will be treated separately.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = HerbertTokenizer
def __init__(self, vocab_file, merges_file, tokenizer_file=None, **kwargs):
kwargs["cls_token"] = "<s>"
kwargs["unk_token"] = "<unk>"
kwargs["pad_token"] = "<pad>"
kwargs["mask_token"] = "<mask>"
kwargs["sep_token"] = "</s>"
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
**kwargs,
)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A HerBERT sequence, like a BERT sequence, has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
cls = [self.cls_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A HerBERT sequence pair mask, like a BERT one, has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
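
A self-contained illustration of the special-token layout implemented by the HerBERT helpers above; the token ids are hypothetical placeholders, not values from the real vocabulary.

# Hypothetical ids: 0 stands for <s> (cls), 2 for </s> (sep).
cls, sep = [0], [2]
ids_a, ids_b = [10, 11, 12], [20, 21]

single = cls + ids_a + sep              # <s> A </s>
pair = cls + ids_a + sep + ids_b + sep  # <s> A </s> B </s>
assert single == [0, 10, 11, 12, 2]
assert pair == [0, 10, 11, 12, 2, 20, 21, 2]

# Token type ids mirror create_token_type_ids_from_sequences above.
token_type_ids = len(cls + ids_a + sep) * [0] + len(ids_b + sep) * [1]
assert token_type_ids == [0, 0, 0, 0, 0, 1, 1, 1]
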
@@ -15,7 +15,7 @@
""" Tokenization class for model LayoutLM."""
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .tokenization_bert import BertTokenizer
from .utils import logging
@@ -58,21 +58,3 @@ class LayoutLMTokenizer(BertTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
class LayoutLMTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" LayoutLMTokenizer.
:class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"]
# coding=utf-8
# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for model LayoutLM."""
from .tokenization_bert_fast import BertTokenizerFast
from .tokenization_layoutlm import LayoutLMTokenizer
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlm-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
"microsoft/layoutlm-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
},
"tokenizer_file": {
"microsoft/layoutlm-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json",
"microsoft/layoutlm-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/layoutlm-base-uncased": 512,
"microsoft/layoutlm-large-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/layoutlm-base-uncased": {"do_lower_case": True},
"microsoft/layoutlm-large-uncased": {"do_lower_case": True},
}
class LayoutLMTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" LayoutLMTokenizer.
:class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = LayoutLMTokenizer
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from .tokenization_roberta import RobertaTokenizer
from .utils import logging
@@ -54,19 +54,3 @@ class LongformerTokenizer(RobertaTokenizer):
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
}
class LongformerTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer
to the superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
}
slow_tokenizer_class = LongformerTokenizer
# coding=utf-8
# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenization_longformer import LongformerTokenizer
from .tokenization_roberta_fast import RobertaTokenizerFast
from .utils import logging
logger = logging.get_logger(__name__)
# vocab and merges same as roberta
vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
tokenizer_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tokenizer.json"
_all_longformer_models = [
"allenai/longformer-base-4096",
"allenai/longformer-large-4096",
"allenai/longformer-large-4096-finetuned-triviaqa",
"allenai/longformer-base-4096-extra.pos.embd.only",
"allenai/longformer-large-4096-extra.pos.embd.only",
]
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"allenai/longformer-base-4096": 4096,
"allenai/longformer-large-4096": 4096,
"allenai/longformer-large-4096-finetuned-triviaqa": 4096,
"allenai/longformer-base-4096-extra.pos.embd.only": 4096,
"allenai/longformer-large-4096-extra.pos.embd.only": 4096,
}
class LongformerTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer
to the superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
"tokenizer_file": {m: tokenizer_url for m in _all_longformer_models},
}
slow_tokenizer_class = LongformerTokenizer
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .tokenization_bert import BertTokenizer
####################################################
@@ -63,20 +63,3 @@ class LxmertTokenizer(BertTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
class LxmertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = LxmertTokenizer
# coding=utf-8
# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenization_bert_fast import BertTokenizerFast
from .tokenization_lxmert import LxmertTokenizer
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model shortcut names.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"unc-nlp/lxmert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
},
"tokenizer_file": {
"unc-nlp/lxmert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json",
},
}
####################################################
# Mapping from model shortcut names to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"unc-nlp/lxmert-base-uncased": 512,
}
####################################################
# Mapping from model shortcut names to a dictionary of additional
# keyword arguments for Tokenizer `__init__`.
# To be used for checkpoint specific configurations.
####################################################
PRETRAINED_INIT_CONFIGURATION = {
"unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
}
class LxmertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = LxmertTokenizer
@@ -18,6 +18,19 @@ vocab_files_names = {
"vocab": "vocab.json",
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"source_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/source.spm"},
"target_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/target.spm"},
"vocab": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/vocab.json"},
"tokenizer_config_file": {
"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/tokenizer_config.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"Helsinki-NLP/opus-mt-en-de": 512}
PRETRAINED_INIT_CONFIGURATION = {}
# Example URL https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/vocab.json
@@ -63,6 +76,9 @@ class MarianTokenizer(PreTrainedTokenizer):
"""
vocab_files_names = vocab_files_names
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
language_code_re = re.compile(">>.+<<") # type: re.Pattern
@@ -189,27 +205,22 @@ class MarianTokenizer(PreTrainedTokenizer):
def vocab_size(self) -> int:
return len(self.encoder)
def save_vocabulary(self, save_directory: str) -> Tuple[str]:
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
save_dir = Path(save_directory)
assert save_dir.is_dir(), f"{save_directory} should be a directory"
save_json(self.encoder, save_dir / self.vocab_files_names["vocab"])
save_json(
self.encoder,
save_dir / ((filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab"]),
)
for orig, f in zip(["source.spm", "target.spm"], self.spm_files):
dest_path = save_dir / Path(f).name
dest_path = save_dir / ((filename_prefix + "-" if filename_prefix else "") + Path(f).name)
if not dest_path.exists():
copyfile(f, save_dir / orig)
return tuple(save_dir / f for f in self.vocab_files_names)
return tuple(
save_dir / ((filename_prefix + "-" if filename_prefix else "") + f) for f in self.vocab_files_names
)
def get_vocab(self) -> Dict:
vocab = self.encoder.copy()
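The Marian save_vocabulary above uses the same prefix-building convention as the other tokenizers in this diff. A tiny sketch of that pattern in isolation; the helper name is made up for illustration.

from pathlib import Path

def prefixed_path(save_dir, filename, filename_prefix=None):
    # Mirrors the (prefix + "-" if prefix else "") + name pattern from the diff.
    return Path(save_dir) / ((filename_prefix + "-" if filename_prefix else "") + filename)

assert prefixed_path("out", "vocab.json").name == "vocab.json"
assert prefixed_path("out", "vocab.json", "ckpt1").name == "ckpt1-vocab.json"
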
@@ -13,7 +13,7 @@
# limitations under the License.
"""Tokenization classes for MobileBERT."""
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .tokenization_bert import BertTokenizer
from .utils import logging
@@ -27,7 +27,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512}
PRETRAINED_INIT_CONFIGURATION = {}
@@ -48,21 +48,3 @@ class MobileBertTokenizer(BertTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
class MobileBertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = MobileBertTokenizer
# coding=utf-8
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for MobileBERT."""
from .tokenization_bert_fast import BertTokenizerFast
from .tokenization_mobilebert import MobileBertTokenizer
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"mobilebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/google/mobilebert-uncased/vocab.txt"
},
"tokenizer_file": {
"mobilebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/google/mobilebert-uncased/tokenizer.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512}
PRETRAINED_INIT_CONFIGURATION = {}
class MobileBertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = MobileBertTokenizer
@@ -19,7 +19,7 @@
import os
import re
from shutil import copyfile
from typing import List, Optional
from typing import List, Optional, Tuple
from .tokenization_utils import PreTrainedTokenizer
from .utils import logging
@@ -311,22 +311,16 @@ class PhobertTokenizer(PreTrainedTokenizer):
out_string = " ".join(tokens).replace("@@ ", "").strip()
return out_string
def save_vocabulary(self, save_directory):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
out_merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
out_merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)