Unverified Commit 9aeacb58 authored by Thomas Wolf, committed by GitHub


Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)

* [WIP] SP tokenizers

* fixing tests for T5

* WIP tokenizers

* serialization

* update T5

* WIP T5 tokenization

* slow to fast conversion script

* Refactoring to move tokenizer implementations inside transformers

* Adding GPT - refactoring - quality

* WIP adding several tokenizers to the fast world

* WIP Roberta - moving implementations

* update to dev4, switch file loading to in-memory loading

* Updating and fixing

* advancing on the tokenizers - updating do_lower_case

* style and quality

* moving forward with tokenizers conversion and tests

* MBart, T5

* dropping the fast version of Transformer-XL

* Adding to autotokenizers + style/quality

* update init and space_between_special_tokens

* style and quality

* bump up tokenizers version

* add protobuf

* fix pickling of Bert JP with MeCab

* fix newly added tokenizers

* style and quality

* fix bert japanese

* fix funnel

* limit tokenizer warning to one occurrence

* clean up file

* fix new tokenizers

* fast tokenizers deep tests

* WIP adding all the special fast tests on the new fast tokenizers

* quick fix

* adding more fast tokenizers in the fast tests

* all tokenizers in fast version tested

* Adding BertGenerationFast

* bump up setup.py for CI

* remove BertGenerationFast (too early)

* bump up tokenizers version

* Clean old docstrings

* Typo

* Update following Lysandre comments
Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
parent 4d04120c
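A minimal usage sketch of what this change enables, assuming a transformers build that includes this PR (with tokenizers >= 0.9 installed) and network access to download checkpoints; the model names below are examples only:

from transformers import AutoTokenizer, T5TokenizerFast

# With this PR, SentencePiece-based checkpoints can resolve to a Rust-backed fast tokenizer.
tok = AutoTokenizer.from_pretrained("t5-small", use_fast=True)
print(type(tok).__name__)  # expected: T5TokenizerFast

# The fast class can also be used directly; it is typically built by converting the slow
# SentencePiece model on load.
tok_direct = T5TokenizerFast.from_pretrained("t5-small")
print(tok_direct("Hello world").input_ids)

# Note: this PR removes TransfoXLTokenizerFast; Transfo-XL only keeps its slow tokenizer.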
......@@ -46,13 +46,6 @@ TransfoXLTokenizer
:members: save_vocabulary
TransfoXLTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TransfoXLTokenizerFast
:members:
TransfoXL specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
......@@ -111,7 +111,7 @@ setup(
packages=find_packages("src"),
install_requires=[
"numpy",
"tokenizers == 0.8.1.rc2",
"tokenizers == 0.9.0.rc2",
# dataclasses for Python versions that don't have it
"dataclasses;python_version<'3.7'",
# utilities from PyPA to e.g. compare versions
......@@ -124,8 +124,9 @@ setup(
"tqdm >= 4.27",
# for OpenAI GPT
"regex != 2019.12.17",
# for XLNet
# for SentencePiece models
"sentencepiece != 0.1.92",
"protobuf",
# for XLM
"sacremoses",
],
......
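For reference, an illustrative way to confirm that the dependencies touched in setup.py above are importable in the current environment ("google.protobuf" is the import name of the protobuf package):

import importlib

for name in ("tokenizers", "sentencepiece", "google.protobuf"):
    module = importlib.import_module(name)
    print(name, getattr(module, "__version__", "unknown"))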
......@@ -152,7 +152,7 @@ from .pipelines import (
from .retrieval_rag import RagRetriever
# Tokenizers
from .tokenization_albert import AlbertTokenizer
from .tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
from .tokenization_bart import BartTokenizer, BartTokenizerFast
from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer
......@@ -160,7 +160,7 @@ from .tokenization_bert_generation import BertGenerationTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
from .tokenization_bertweet import BertweetTokenizer
from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer
from .tokenization_camembert import CamembertTokenizer
from .tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast
from .tokenization_ctrl import CTRLTokenizer
from .tokenization_deberta import DebertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
......@@ -180,18 +180,18 @@ from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_layoutlm import LayoutLMTokenizer, LayoutLMTokenizerFast
from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
from .tokenization_mbart import MBartTokenizer
from .tokenization_mbart import MBartTokenizer, MBartTokenizerFast
from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from .tokenization_pegasus import PegasusTokenizer
from .tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
from .tokenization_phobert import PhobertTokenizer
from .tokenization_rag import RagTokenizer
from .tokenization_reformer import ReformerTokenizer
from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast
from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from .tokenization_t5 import T5Tokenizer
from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast
from .tokenization_t5 import T5Tokenizer, T5TokenizerFast
from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
BatchEncoding,
......@@ -203,8 +203,8 @@ from .tokenization_utils_base import (
)
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_xlm import XLMTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer
from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast
from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast
# Trainer
from .trainer_callback import (
......
......@@ -21,6 +21,7 @@ from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -340,3 +341,206 @@ class AlbertTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
class AlbertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on
`SentencePiece <https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = AlbertTokenizer
def __init__(
self,
vocab_file,
do_lower_case=True,
remove_space=True,
keep_accents=False,
bos_token="[CLS]",
eos_token="[SEP]",
unk_token="<unk>",
sep_token="[SEP]",
pad_token="<pad>",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs
):
super().__init__(
vocab_file,
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs,
)
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
An ALBERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
An ALBERT sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory):
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
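A self-contained sketch of the ALBERT special-token layout implemented by the methods above; the [CLS]/[SEP] IDs below are placeholders, not the real ALBERT vocabulary IDs:

CLS_ID, SEP_ID = 2, 3  # placeholder IDs for illustration only

def albert_build_inputs(ids_a, ids_b=None):
    # single sequence: [CLS] A [SEP]; pair: [CLS] A [SEP] B [SEP]
    if ids_b is None:
        return [CLS_ID] + ids_a + [SEP_ID]
    return [CLS_ID] + ids_a + [SEP_ID] + ids_b + [SEP_ID]

def albert_token_type_ids(ids_a, ids_b=None):
    # zeros over "[CLS] A [SEP]", ones over "B [SEP]"
    first_segment = [0] * (len(ids_a) + 2)
    if ids_b is None:
        return first_segment
    return first_segment + [1] * (len(ids_b) + 1)

ids_a, ids_b = [10, 11, 12], [20, 21]
print(albert_build_inputs(ids_a, ids_b))    # [2, 10, 11, 12, 3, 20, 21, 3]
print(albert_token_type_ids(ids_a, ids_b))  # [0, 0, 0, 0, 0, 1, 1, 1]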
......@@ -56,14 +56,14 @@ from .configuration_auto import (
replace_list_option_in_docstrings,
)
from .configuration_utils import PretrainedConfig
from .tokenization_albert import AlbertTokenizer
from .tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
from .tokenization_bart import BartTokenizer, BartTokenizerFast
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .tokenization_bert_generation import BertGenerationTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer
from .tokenization_bertweet import BertweetTokenizer
from .tokenization_blenderbot import BlenderbotSmallTokenizer
from .tokenization_camembert import CamembertTokenizer
from .tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast
from .tokenization_ctrl import CTRLTokenizer
from .tokenization_deberta import DebertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
......@@ -77,21 +77,21 @@ from .tokenization_layoutlm import LayoutLMTokenizer, LayoutLMTokenizerFast
from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
from .tokenization_marian import MarianTokenizer
from .tokenization_mbart import MBartTokenizer
from .tokenization_mbart import MBartTokenizer, MBartTokenizerFast
from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from .tokenization_pegasus import PegasusTokenizer
from .tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
from .tokenization_phobert import PhobertTokenizer
from .tokenization_rag import RagTokenizer
from .tokenization_reformer import ReformerTokenizer
from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast
from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from .tokenization_t5 import T5Tokenizer
from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLTokenizerFast
from .tokenization_t5 import T5Tokenizer, T5TokenizerFast
from .tokenization_transfo_xl import TransfoXLTokenizer
from .tokenization_xlm import XLMTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer
from .tokenization_xlnet import XLNetTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast
from .tokenization_xlnet import XLNetTokenizer, XLNetTokenizerFast
from .utils import logging
......@@ -101,14 +101,14 @@ logger = logging.get_logger(__name__)
TOKENIZER_MAPPING = OrderedDict(
[
(RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)),
(T5Config, (T5Tokenizer, None)),
(T5Config, (T5Tokenizer, T5TokenizerFast)),
(MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)),
(DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)),
(AlbertConfig, (AlbertTokenizer, None)),
(CamembertConfig, (CamembertTokenizer, None)),
(PegasusConfig, (PegasusTokenizer, None)),
(MBartConfig, (MBartTokenizer, None)),
(XLMRobertaConfig, (XLMRobertaTokenizer, None)),
(AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)),
(CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)),
(PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)),
(MBartConfig, (MBartTokenizer, MBartTokenizerFast)),
(XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)),
(MarianConfig, (MarianTokenizer, None)),
(BlenderbotConfig, (BlenderbotSmallTokenizer, None)),
(LongformerConfig, (LongformerTokenizer, None)),
......@@ -117,7 +117,7 @@ TOKENIZER_MAPPING = OrderedDict(
(RobertaConfig, (BertweetTokenizer, None)),
(RobertaConfig, (PhobertTokenizer, None)),
(RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
(ReformerConfig, (ReformerTokenizer, None)),
(ReformerConfig, (ReformerTokenizer, ReformerTokenizerFast)),
(ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
(FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)),
(LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)),
......@@ -127,15 +127,14 @@ TOKENIZER_MAPPING = OrderedDict(
(BertConfig, (BertTokenizer, BertTokenizerFast)),
(OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
(GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)),
(TransfoXLConfig, (TransfoXLTokenizer, TransfoXLTokenizerFast)),
(XLNetConfig, (XLNetTokenizer, None)),
(TransfoXLConfig, (TransfoXLTokenizer, None)),
(XLNetConfig, (XLNetTokenizer, XLNetTokenizerFast)),
(FlaubertConfig, (FlaubertTokenizer, None)),
(XLMConfig, (XLMTokenizer, None)),
(CTRLConfig, (CTRLTokenizer, None)),
(FSMTConfig, (FSMTTokenizer, None)),
(BertGenerationConfig, (BertGenerationTokenizer, None)),
(DebertaConfig, (DebertaTokenizer, None)),
(LayoutLMConfig, (LayoutLMTokenizer, None)),
(RagConfig, (RagTokenizer, None)),
]
)
......
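An illustrative sketch (not the library's actual dispatch code) of how a (slow, fast) pair in TOKENIZER_MAPPING is resolved: the fast class is preferred when requested and available, and entries whose fast slot is None fall back to the slow class:

from collections import OrderedDict

# Class names stand in for the real classes; only two entries are shown.
EXAMPLE_MAPPING = OrderedDict(
    [
        ("T5Config", ("T5Tokenizer", "T5TokenizerFast")),
        ("MarianConfig", ("MarianTokenizer", None)),
    ]
)

def pick_tokenizer_class(config_name, use_fast):
    slow_class, fast_class = EXAMPLE_MAPPING[config_name]
    if use_fast and fast_class is not None:
        return fast_class
    return slow_class

print(pick_tokenizer_class("T5Config", use_fast=True))      # T5TokenizerFast
print(pick_tokenizer_class("MarianConfig", use_fast=True))  # MarianTokenizer (no fast version)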
......@@ -163,6 +163,7 @@ class BartTokenizerFast(RobertaTokenizerFast):
"vocab_file": {m: vocab_url for m in _all_bart_models},
"merges_file": {m: merges_url for m in _all_bart_models},
}
slow_tokenizer_class = BartTokenizer
def prepare_seq2seq_batch(
self,
......
......@@ -20,8 +20,6 @@ import os
import unicodedata
from typing import List, Optional
from tokenizers import BertWordPieceTokenizer
from .tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -206,6 +204,10 @@ class BertTokenizer(PreTrainedTokenizer):
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
......@@ -329,7 +331,7 @@ class BertTokenizer(PreTrainedTokenizer):
def save_vocabulary(self, vocab_path):
"""
Save the vocabulary (copy original file) and special tokens file to a directory.
Save the vocabulary and special tokens file to a directory.
Args:
vocab_path (:obj:`str`):
......@@ -610,6 +612,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = BertTokenizer
def __init__(
self,
......@@ -620,31 +623,20 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
clean_text=True,
tokenize_chinese_chars=True,
strip_accents=None,
wordpieces_prefix="##",
**kwargs
):
super().__init__(
BertWordPieceTokenizer(
vocab_file=vocab_file,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
clean_text=clean_text,
handle_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
lowercase=do_lower_case,
wordpieces_prefix=wordpieces_prefix,
),
vocab_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
......
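A hedged sketch of the new constructor path above: BertTokenizerFast now receives the vocab file and options directly and builds the Rust tokenizer internally, instead of being handed a pre-built BertWordPieceTokenizer. Assuming a transformers build that includes this PR, a toy vocabulary is enough to try it:

import os
import tempfile

from transformers import BertTokenizerFast

vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "hello", "world"]
with tempfile.TemporaryDirectory() as tmp_dir:
    vocab_file = os.path.join(tmp_dir, "vocab.txt")
    with open(vocab_file, "w", encoding="utf-8") as f:
        f.write("\n".join(vocab))
    # Options such as do_lower_case/strip_accents are now passed straight to the fast class.
    tokenizer = BertTokenizerFast(vocab_file, do_lower_case=True, strip_accents=None)
    print(tokenizer("hello world").input_ids)  # expected [2, 5, 6, 3] with this toy vocab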
......@@ -16,6 +16,7 @@
import collections
import copy
import os
import unicodedata
from typing import Optional
......@@ -116,6 +117,13 @@ class BertJapaneseTokenizer(BertTokenizer):
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
do_lower_case=do_lower_case,
do_word_tokenize=do_word_tokenize,
do_subword_tokenize=do_subword_tokenize,
word_tokenizer_type=word_tokenizer_type,
subword_tokenizer_type=subword_tokenizer_type,
never_split=never_split,
mecab_kwargs=mecab_kwargs,
**kwargs,
)
# ^^ We call the grandparent's init, not the parent's.
......@@ -129,6 +137,10 @@ class BertJapaneseTokenizer(BertTokenizer):
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_word_tokenize = do_word_tokenize
self.word_tokenizer_type = word_tokenizer_type
self.lower_case = do_lower_case
self.never_split = never_split
self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
if do_word_tokenize:
if word_tokenizer_type == "basic":
self.word_tokenizer = BasicTokenizer(
......@@ -142,6 +154,7 @@ class BertJapaneseTokenizer(BertTokenizer):
raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
self.do_subword_tokenize = do_subword_tokenize
self.subword_tokenizer_type = subword_tokenizer_type
if do_subword_tokenize:
if subword_tokenizer_type == "wordpiece":
self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
......@@ -150,6 +163,23 @@ class BertJapaneseTokenizer(BertTokenizer):
else:
raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
@property
def do_lower_case(self):
return self.lower_case
def __getstate__(self):
state = dict(self.__dict__)
if self.word_tokenizer_type == "mecab":
del state["word_tokenizer"]
return state
def __setstate__(self, state):
self.__dict__ = state
if self.word_tokenizer_type == "mecab":
self.word_tokenizer = MecabTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
)
def _tokenize(self, text):
if self.do_word_tokenize:
tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
......
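The __getstate__/__setstate__ pair above is what makes the MeCab-backed tokenizer picklable; a generic, self-contained sketch of the same pattern (the segmenter class here is a stand-in, not part of transformers):

import pickle

class FakeSegmenter:
    """Stand-in for an object holding an unpicklable native handle (e.g. a MeCab tagger)."""

    def tokenize(self, text):
        return text.split()

class PicklableWrapper:
    def __init__(self):
        self.word_tokenizer = FakeSegmenter()

    def __getstate__(self):
        state = dict(self.__dict__)
        del state["word_tokenizer"]  # drop the unpicklable member
        return state

    def __setstate__(self, state):
        self.__dict__ = state
        self.word_tokenizer = FakeSegmenter()  # rebuild it after unpickling

restored = pickle.loads(pickle.dumps(PicklableWrapper()))
print(restored.word_tokenizer.tokenize("hello world"))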
......@@ -129,7 +129,6 @@ class BertweetTokenizer(PreTrainedTokenizer):
**kwargs
):
super().__init__(
max_len=128,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
......
......@@ -22,6 +22,7 @@ from typing import List, Optional
import sentencepiece as spm
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -36,7 +37,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"camembert-base": None,
"camembert-base": 512,
}
SHARED_MODEL_IDENTIFIERS = [
......@@ -118,7 +119,6 @@ class CamembertTokenizer(PreTrainedTokenizer):
**kwargs
):
super().__init__(
max_len=512,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
......@@ -223,6 +223,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
def vocab_size(self):
return len(self.fairseq_tokens_to_ids) + len(self.sp_model)
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
return self.sp_model.EncodeAsPieces(text)
......@@ -284,3 +289,189 @@ class CamembertTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
class CamembertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
<https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = CamembertTokenizer
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
**kwargs
):
super().__init__(
vocab_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A CamemBERT sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory):
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
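A tiny sketch contrasting the CamemBERT/RoBERTa pair layout used above (<s> A </s></s> B </s>, note the doubled separator) with the single-separator ALBERT layout; the IDs are placeholders, not CamemBERT's real ones:

CLS_ID, SEP_ID = 5, 6  # placeholder IDs for illustration only

def camembert_pair_inputs(ids_a, ids_b):
    # <s> A </s></s> B </s> -- the separator is doubled between the two segments
    return [CLS_ID] + ids_a + [SEP_ID, SEP_ID] + ids_b + [SEP_ID]

def camembert_token_type_ids(ids_a, ids_b):
    # CamemBERT, like RoBERTa, does not use token type ids, so the mask is all zeros
    return [0] * (len(ids_a) + len(ids_b) + 4)

print(camembert_pair_inputs([10, 11], [20]))     # [5, 10, 11, 6, 6, 20, 6]
print(camembert_token_type_ids([10, 11], [20]))  # [0, 0, 0, 0, 0, 0, 0]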
......@@ -87,3 +87,4 @@ class DistilBertTokenizerFast(BertTokenizerFast):
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"]
slow_tokenizer_class = DistilBertTokenizer
......@@ -98,6 +98,7 @@ class DPRContextEncoderTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = DPRContextEncoderTokenizer
class DPRQuestionEncoderTokenizer(BertTokenizer):
......@@ -132,6 +133,7 @@ class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = DPRQuestionEncoderTokenizer
DPRSpanPrediction = collections.namedtuple(
......@@ -417,3 +419,4 @@ class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"]
slow_tokenizer_class = DPRReaderTokenizer
......@@ -80,3 +80,4 @@ class ElectraTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = ElectraTokenizer
......@@ -181,6 +181,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
**kwargs
):
super().__init__(
langs=langs,
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
......
......@@ -152,6 +152,7 @@ class FunnelTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = FunnelTokenizer
cls_token_type_id: int = 2
def __init__(
......@@ -217,16 +218,3 @@ class FunnelTokenizerFast(BertTokenizerFast):
if token_ids_1 is None:
return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def _convert_encoding(self, encoding, **kwargs):
# The fast tokenizer doesn't use the function above so we fix the cls token type id when decoding the fast
# tokenizer output.
encoding_dict = super()._convert_encoding(encoding, **kwargs)
if "token_type_ids" in encoding_dict:
# Note: we can't assume the <cls> token is in first position because left padding is a thing, hence the
# double list comprehension.
encoding_dict["token_type_ids"] = [
[self.cls_token_type_id if i == self.cls_token_id else t for i, t in zip(input_ids, type_ids)]
for input_ids, type_ids in zip(encoding_dict["input_ids"], encoding_dict["token_type_ids"])
]
return encoding_dict
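For reference, a standalone sketch of the Funnel-specific token type layout that the removed helper was patching into the fast tokenizer's output: the [CLS] position carries token type 2 while the remaining first/second segment positions get 0/1 (lengths and IDs are illustrative):

def funnel_token_type_ids(len_a, len_b=None, cls_token_type_id=2):
    # [CLS] gets the special type id, then "A [SEP]" gets 0s and "B [SEP]" gets 1s
    type_ids = [cls_token_type_id] + [0] * (len_a + 1)
    if len_b is not None:
        type_ids += [1] * (len_b + 1)
    return type_ids

print(funnel_token_type_ids(3, 2))  # [2, 0, 0, 0, 0, 1, 1, 1]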
......@@ -21,7 +21,6 @@ import warnings
from functools import lru_cache
import regex as re
from tokenizers import ByteLevelBPETokenizer
from .tokenization_utils import AddedToken, PreTrainedTokenizer
from .tokenization_utils_base import BatchEncoding
......@@ -360,6 +359,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = GPT2Tokenizer
def __init__(
self,
......@@ -369,19 +369,15 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
trim_offsets=True,
**kwargs
):
super().__init__(
ByteLevelBPETokenizer(
vocab_file=vocab_file,
merges_file=merges_file,
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
),
vocab_file,
merges_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
self.add_prefix_space = add_prefix_space
......@@ -409,8 +405,9 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
else:
is_split_into_words = kwargs.get("is_split_into_words", False)
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
......
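A pure-Python illustration of the guard enforced above: pretokenized input (is_split_into_words=True) is only allowed when the tokenizer was created with add_prefix_space=True; the names are reused for clarity only, this is not the library's code:

def check_pretokenized_usage(add_prefix_space, is_split_into_words):
    assert add_prefix_space or not is_split_into_words, (
        "You need to instantiate the tokenizer with add_prefix_space=True "
        "to use it with pretokenized inputs."
    )
    return True

print(check_pretokenized_usage(add_prefix_space=True, is_split_into_words=True))    # True
print(check_pretokenized_usage(add_prefix_space=False, is_split_into_words=False))  # True
# check_pretokenized_usage(add_prefix_space=False, is_split_into_words=True) would raise AssertionError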
......@@ -69,3 +69,4 @@ class LongformerTokenizerFast(RobertaTokenizerFast):
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
}
slow_tokenizer_class = LongformerTokenizer
......@@ -79,3 +79,4 @@ class LxmertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = LxmertTokenizer
......@@ -15,10 +15,12 @@
from typing import List, Optional
from tokenizers import processors
from .file_utils import add_start_docstrings
from .tokenization_utils import BatchEncoding
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING
from .tokenization_xlm_roberta import XLMRobertaTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast
from .utils import logging
......@@ -109,6 +111,10 @@ class MBartTokenizer(XLMRobertaTokenizer):
self._additional_special_tokens = list(self.lang_code_to_id.keys())
self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX"))
@property
def vocab_size(self):
return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 # Plus 1 for the mask token
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
......@@ -227,3 +233,185 @@ class MBartTokenizer(XLMRobertaTokenizer):
self.cur_lang_code = self.lang_code_to_id[lang]
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
class MBartTokenizerFast(XLMRobertaTokenizerFast):
"""
Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds
a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`.
Refer to superclass :class:`~transformers.XLMRobertaTokenizerFast` for usage examples and documentation concerning
the initialization parameters and other methods.
.. warning::
``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work
properly.
The tokenization method is ``<tokens> <eos> <language code>`` for source language documents, and
``<language code> <tokens> <eos>`` for target language documents.
Examples::
>>> from transformers import MBartTokenizerFast
>>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro')
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> batch: dict = tokenizer.prepare_seq2seq_batch(
... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
... )
"""
vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
max_model_input_sizes = {m: 1024 for m in _all_mbart_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}}
slow_tokenizer_class = MBartTokenizer
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.cur_lang_code = self.convert_tokens_to_ids("en_XX")
self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX"))
self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES})
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens. The special tokens depend on the language set via
``set_src_lang_special_tokens``/``set_tgt_lang_special_tokens``.
An MBART sequence has the following format, where ``X`` represents the sequence:
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]``
BOS is never used.
Pairs of sequences are not the expected use case, but they will be handled without a separator.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
# We don't expect to process pairs, but leave the pair logic for API consistency
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
@add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
truncation: bool = True,
padding: str = "longest",
return_tensors: str = "pt",
**kwargs,
) -> BatchEncoding:
if max_length is None:
max_length = self.max_len
self.set_src_lang_special_tokens(src_lang)
model_inputs: BatchEncoding = self(
src_texts,
add_special_tokens=True,
return_tensors=return_tensors,
max_length=max_length,
padding=padding,
truncation=truncation,
**kwargs,
)
if tgt_texts is None:
return model_inputs
# Process tgt_texts
if max_target_length is None:
max_target_length = max_length
self.set_tgt_lang_special_tokens(tgt_lang)
labels = self(
tgt_texts,
add_special_tokens=True,
return_tensors=return_tensors,
padding=padding,
max_length=max_target_length,
truncation=True,
**kwargs,
)["input_ids"]
model_inputs["labels"] = labels
self.set_src_lang_special_tokens(src_lang) # sets to src_lang
return model_inputs
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code]."""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target language setting. Prefix [tgt_lang_code], suffix =[eos]."""
self.cur_lang_code = self.convert_tokens_to_ids(lang)
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
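A small standalone sketch of the tokenizers post-processor configured in the two methods above: a TemplateProcessing that appends the end-of-sequence token and a language code after the sequence, mirroring the MBART source-language layout. The toy vocabulary and IDs are made up, and this assumes a recent tokenizers release (>= 0.9) where WordLevel accepts an in-memory vocab:

from tokenizers import Tokenizer, processors
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

vocab = {"hello": 0, "world": 1, "</s>": 2, "en_XX": 3, "<unk>": 4}
toy_tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
toy_tokenizer.pre_tokenizer = Whitespace()

# No prefix tokens; suffix is [eos, language code], as in set_src_lang_special_tokens above.
toy_tokenizer.post_processor = processors.TemplateProcessing(
    single="$A </s> en_XX",
    pair="$A $B </s> en_XX",
    special_tokens=[("</s>", 2), ("en_XX", 3)],
)
print(toy_tokenizer.encode("hello world").ids)  # [0, 1, 2, 3]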