"csrc/quantization/git@developer.sourcefind.cn:norm/vllm.git" did not exist on "2b1c116b5acdf3b738e310f98617875132214c37"
Unverified Commit 9aeacb58 authored by Thomas Wolf, committed by GitHub

Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)

* [WIP] SP tokenizers

* fixing tests for T5

* WIP tokenizers

* serialization

* update T5

* WIP T5 tokenization

* slow to fast conversion script

* Refactoring to move tokenizer implementations inside transformers

* Adding gpt - refactoring - quality

* WIP adding several tokenizers to the fast world

* WIP Roberta - moving implementations

* update to dev4 switch file loading to in-memory loading

* Updating and fixing

* advancing on the tokenizers - updating do_lower_case

* style and quality

* moving forward with tokenizers conversion and tests

* MBart, T5

* dumping the fast version of transformer XL

* Adding to autotokenizers + style/quality

* update init and space_between_special_tokens

* style and quality

* bump up tokenizers version

* add protobuf

* fix pickle Bert JP with Mecab

* fix newly added tokenizers

* style and quality

* fix bert japanese

* fix funnel

* limit tokenizer warning to one occurrence

* clean up file

* fix new tokenizers

* fast tokenizers deep tests

* WIP adding all the special fast tests on the new fast tokenizers

* quick fix

* adding more fast tokenizers in the fast tests

* all tokenizers in fast version tested

* Adding BertGenerationFast

* bump up setup.py for CI

* remove BertGenerationFast (too early)

* bump up tokenizers version

* Clean old docstrings

* Typo

* Update following Lysandre comments
Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
parent 4d04120c
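
A minimal usage sketch (not part of the diff; the model name and flags below are only illustrative assumptions) of what this change enables: SentencePiece-based models now get a Rust-backed fast tokenizer alongside the existing slow one.

from transformers import AutoTokenizer

# use_fast=True should now resolve to a *Fast class (e.g. T5TokenizerFast)
# for SentencePiece-based models; requires the sentencepiece and protobuf deps.
fast_tok = AutoTokenizer.from_pretrained("t5-small", use_fast=True)
slow_tok = AutoTokenizer.from_pretrained("t5-small", use_fast=False)

text = "translate English to German: Hello world"
# both tokenizers are expected to produce the same ids for plain text
print(fast_tok(text)["input_ids"] == slow_tok(text)["input_ids"])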
......@@ -65,3 +65,4 @@ class MobileBertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = MobileBertTokenizer
......@@ -19,8 +19,6 @@ import json
import os
import re
from tokenizers import CharBPETokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
......@@ -123,6 +121,10 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
@property
def do_lower_case(self):
return True
@property
def vocab_size(self):
return len(self.encoder)
......@@ -243,9 +245,8 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with
the following peculiarities:
- lowercases all inputs,
- uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
:obj:`BasicTokenizer` if not.
- lower case all inputs
- uses BERT's BasicTokenizer for pre-BPE tokenization
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
......@@ -264,10 +265,11 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = OpenAIGPTTokenizer
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
kwargs.setdefault("unk_token", unk_token)
super().__init__(
CharBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token, lowercase=True),
**kwargs,
)
super().__init__(vocab_file, merges_file, unk_token=unk_token, **kwargs)
@property
def do_lower_case(self):
return True
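
A short usage sketch (not part of the diff; assumes the openai-gpt checkpoint is available) for the refactored fast GPT tokenizer, which now exposes do_lower_case like its slow counterpart:

from transformers import OpenAIGPTTokenizerFast

tok = OpenAIGPTTokenizerFast.from_pretrained("openai-gpt")
print(tok.do_lower_case)            # True, via the property added above
print(tok.tokenize("Hello World"))  # inputs are lowercased before BPE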
......@@ -15,10 +15,23 @@
from typing import Dict, List, Optional
from .file_utils import add_start_docstrings
from .tokenization_reformer import ReformerTokenizer
from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/pegasus-xsum": 512,
}
class PegasusTokenizer(ReformerTokenizer):
r"""
Construct a Pegasus tokenizer.
......@@ -31,6 +44,8 @@ class PegasusTokenizer(ReformerTokenizer):
"""
offset = 103 # entries 2-104 are only used for pretraining
vocab_files_names = {"vocab_file": "spiece.model"}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
......@@ -150,3 +165,85 @@ class PegasusTokenizer(ReformerTokenizer):
# for k, v in decoder_inputs.items():
# model_inputs[f"decoder_{k}"] = v
return model_inputs
class PegasusTokenizerFast(ReformerTokenizerFast):
offset = 103 # entries 2-104 are only used for pretraining
vocab_files_names = {"vocab_file": "spiece.model"}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = PegasusTokenizer
# def num_special_tokens_to_add(self, pair=False):
# """Just EOS"""
# return 1
def _special_token_mask(self, seq):
all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp
all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
assert all_special_ids == set([0, 1])
return [1 if x in all_special_ids else 0 for x in seq]
def get_special_tokens_mask(
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""Get list where entries are [1] if a token is [eos] or [pad] else 0."""
if already_has_special_tokens:
return self._special_token_mask(token_ids_0)
elif token_ids_1 is None:
return self._special_token_mask(token_ids_0) + [1]
else:
return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""
Build model inputs from a sequence by adding eos to the end. No bos token is added to the front.
- single sequence: ``X </s>``
- pair of sequences: ``A B </s>`` (not intended use)
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0 + [self.eos_token_id]
# We don't expect to process pairs, but leave the pair logic for API consistency
return token_ids_0 + token_ids_1 + [self.eos_token_id]
@add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
return_tensors: str = "pt",
truncation=True,
padding="longest",
**unused,
) -> BatchEncoding:
if "" in src_texts:
raise ValueError(f"found empty string in src_texts: {src_texts}")
tokenizer_kwargs = dict(
add_special_tokens=True,
return_tensors=return_tensors,
max_length=max_length,
truncation=truncation,
padding=padding,
)
model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs)
if tgt_texts is None:
return model_inputs
if max_target_length is not None:
tokenizer_kwargs["max_length"] = max_target_length
# TODO(@sshleifer): maybe tgt_texts = [self.pad_token + t for t in tgt_texts] # add decoder_start_token_id
labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
model_inputs["labels"] = labels
# for k, v in decoder_inputs.items():
# model_inputs[f"decoder_{k}"] = v
return model_inputs
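
An illustrative sketch (not part of the diff; assumes the google/pegasus-xsum checkpoint and torch are available) of the new PegasusTokenizerFast: only </s> is appended, no BOS, matching build_inputs_with_special_tokens above.

from transformers import PegasusTokenizerFast

tok = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum")
ids = tok("A short article to summarize.")["input_ids"]
print(ids[-1] == tok.eos_token_id)  # True: eos appended at the end

batch = tok.prepare_seq2seq_batch(
    src_texts=["A short article to summarize."],
    tgt_texts=["A summary."],
    return_tensors="pt",
)
print(sorted(batch.keys()))  # ['attention_mask', 'input_ids', 'labels']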
......@@ -126,7 +126,6 @@ class PhobertTokenizer(PreTrainedTokenizer):
**kwargs
):
super().__init__(
max_len=256,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
......
......@@ -19,6 +19,7 @@ import os
from shutil import copyfile
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -184,3 +185,72 @@ class ReformerTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
class ReformerTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
<https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = ReformerTokenizer
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
additional_special_tokens=[],
**kwargs
):
super().__init__(
vocab_file,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
def save_vocabulary(self, save_directory):
"""Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
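
A quick parity sketch (not part of the diff; the checkpoint name is an assumption) showing that the new fast Reformer tokenizer is intended as a drop-in for the slow SentencePiece one:

from transformers import ReformerTokenizer, ReformerTokenizerFast

name = "google/reformer-crime-and-punishment"
slow = ReformerTokenizer.from_pretrained(name)
fast = ReformerTokenizerFast.from_pretrained(name)

text = "A SentencePiece-backed fast tokenizer."
print(slow(text)["input_ids"] == fast(text)["input_ids"])  # expected: True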
......@@ -71,4 +71,5 @@ class RetriBertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = RetriBertTokenizer
model_input_names = ["attention_mask"]
......@@ -17,8 +17,6 @@
import warnings
from typing import List, Optional
from tokenizers.processors import RobertaProcessing
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_utils import AddedToken
from .utils import logging
......@@ -344,6 +342,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = RobertaTokenizer
def __init__(
self,
......@@ -358,38 +357,23 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
trim_offsets=True,
**kwargs
):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
kwargs.setdefault("pad_token", pad_token)
kwargs.setdefault("sep_token", sep_token)
kwargs.setdefault("cls_token", cls_token)
kwargs.setdefault("mask_token", mask_token)
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
unk_token=unk_token,
vocab_file,
merges_file,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
**kwargs,
)
# This will add the necessary special tokens to the vocabulary if needed
self.sanitize_special_tokens()
self.backend_tokenizer._tokenizer.post_processor = RobertaProcessing(
sep=(sep_token, self.sep_token_id),
cls=(cls_token, self.cls_token_id),
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
......
......@@ -24,6 +24,7 @@ from typing import List, Optional
from .file_utils import add_start_docstrings
from .tokenization_utils import BatchEncoding, PreTrainedTokenizer
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -322,3 +323,161 @@ class T5Tokenizer(PreTrainedTokenizer):
)
model_inputs["labels"] = labels_and_decoder_mask["input_ids"]
return model_inputs
class T5TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
<https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`int`, `optional`, defaults to 100):
Add a number of extra ids added to the end of the vocabulary for use as sentinels.
These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token
in the vocabulary like in T5 preprocessing see `here
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = T5Tokenizer
prefix_tokens: List[int] = []
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
**kwargs
):
super().__init__(
vocab_file,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self._extra_ids = extra_ids
def save_vocabulary(self, save_directory):
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A sequence has the following format:
- single sequence: ``X </s>``
- pair of sequences: ``A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
token_ids_0 = token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0
else:
token_ids_1 = token_ids_1 + [self.eos_token_id]
return self.prefix_tokens + token_ids_0 + token_ids_1
@add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
padding: str = "longest",
return_tensors: str = None,
truncation: bool = True,
**kwargs,
) -> BatchEncoding:
if max_length is None:
max_length = self.max_len
self.prefix_tokens = []
model_inputs = self(
src_texts,
add_special_tokens=True,
return_tensors=return_tensors,
max_length=max_length,
padding=padding,
truncation=truncation,
**kwargs,
)
if tgt_texts is None:
return model_inputs
# Process tgt_texts
if max_target_length is None:
max_target_length = max_length
# set prefix_tokens for target text
self.prefix_tokens = [self.pad_token_id]
labels_and_decoder_mask = self(
tgt_texts,
add_special_tokens=True,
return_tensors=return_tensors,
padding=padding,
max_length=max_target_length,
truncation=truncation,
**kwargs,
)
model_inputs["labels"] = labels_and_decoder_mask["input_ids"]
self.prefix_tokens = []
return model_inputs
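
An illustrative sketch (not part of the diff; assumes the t5-small checkpoint and torch are available) of prepare_seq2seq_batch on the new fast T5 tokenizer:

from transformers import T5TokenizerFast

tok = T5TokenizerFast.from_pretrained("t5-small")
batch = tok.prepare_seq2seq_batch(
    src_texts=["translate English to German: The house is wonderful."],
    tgt_texts=["Das Haus ist wunderbar."],
    return_tensors="pt",  # omit to get plain python lists
)
print(sorted(batch.keys()))  # ['attention_mask', 'input_ids', 'labels']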
......@@ -22,23 +22,15 @@ import glob
import os
import pickle
import re
import warnings
from collections import Counter, OrderedDict
from typing import List, Optional
from typing import List
import numpy as np
import sacremoses as sm
from tokenizers import Tokenizer
from tokenizers.implementations import BaseTokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Sequence, Strip, unicode_normalizer_from_str
from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit
from tokenizers.processors import BertProcessing
from .file_utils import cached_path, is_torch_available, torch_only_method
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -53,7 +45,6 @@ VOCAB_FILES_NAMES = {
"pretrained_vocab_file_torch": "vocab.bin",
"vocab_file": "vocab.txt",
}
VOCAB_FILES_NAMES_FAST = {"pretrained_vocab_file": "vocab.json", "vocab_file": "vocab.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"pretrained_vocab_file": {
......@@ -61,12 +52,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
}
PRETRAINED_VOCAB_FILES_MAP_FAST = {
"pretrained_vocab_file": {
"transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.json",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"transfo-xl-wt103": None,
}
......@@ -240,6 +225,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if vocab_file is not None:
self.build_vocab()
@property
def do_lower_case(self):
return self.lower_case
def _compile_space_around_punctuation_pattern(self):
look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols)
look_ahead_to_match_all_except_space = r"(?=[^\s])"
......@@ -299,11 +288,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
:obj:`Tuple(str)`: Paths to the files saved.
"""
logger.warning(
"Please note you will not be able to load the save vocabulary in"
" Rust-based TransfoXLTokenizerFast as they don't share the same structure."
)
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
else:
......@@ -492,165 +476,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
return symbols
class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
def __init__(
self,
vocab_file,
delimiter,
lowercase,
unk_token,
eos_token,
add_eos=False,
add_double_eos=False,
normalization: Optional[str] = None,
):
try:
tokenizer = WordLevel(vocab_file, unk_token=unk_token)
tokenizer = Tokenizer(tokenizer)
except Exception:
raise ValueError(
"Unable to parse file {}. Unknown format. "
"If you tried to load a model saved through TransfoXLTokenizer,"
"please note they are not compatible.".format(vocab_file)
)
# Create the correct normalization path
normalizer = []
# Include unicode normalization
if normalization:
normalizer += [unicode_normalizer_from_str(normalization)]
# Include case normalization
if lowercase:
normalizer += [Lowercase()]
# Strip normalizer at the end
normalizer += [Strip(left=True, right=True)]
if len(normalizer) > 0:
tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]
# Setup the splitter
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()
if add_double_eos:
tokenizer.post_processor = BertProcessing(
(eos_token, tokenizer.token_to_id(eos_token)), (eos_token, tokenizer.token_to_id(eos_token))
)
parameters = {
"model": "TransfoXLModel",
"add_eos": add_eos,
"add_double_eos": add_double_eos,
"unk_token": unk_token,
"eos_token": eos_token,
"delimiter": delimiter,
"lowercase": lowercase,
}
super().__init__(tokenizer, parameters)
class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" Transformer-XL tokenizer (backed by HuggingFace's `tokenizers` library) adapted from Vocab class
in `the original code <https://github.com/kimiyoung/transformer-xl>`__. The Transformer-XL tokenizer is a
word-level tokenizer (no sub-word tokenization).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
special (:obj:`List[str]`, `optional`):
A list of special tokens (to be treated by the original implementation of this tokenizer).
min_freq (:obj:`int`, `optional`, defaults to 0):
The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
will be mapped to :obj:`unk_token`).
max_size (:obj:`int`, `optional`):
The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
after excluding the tokens according to the :obj:`min_freq` rule.
lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to lowercase the input when tokenizing.
delimiter (:obj:`str`, `optional`):
The delimiter used between tokens.
vocab_file (:obj:`str`, `optional`):
File containing the vocabulary (from the original implementation).
pretrained_vocab_file (:obj:`str`, `optional`):
File containing the vocabulary as saved with the :obj:`save_pretrained()` method.
never_split (xxx, `optional`):
Fill me with interesting stuff.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"<eos>"`):
The end of sequence token.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<formula>"]`):
A list of additional special tokens (for the HuggingFace functionality).
add_eos (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add the end-of-sentence token.
add_double_eos (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add the end-of-sentence token.
normalization (xxx, `optional`):
Fill me with interesting stuff.
"""
vocab_files_names = VOCAB_FILES_NAMES_FAST
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = []
def __init__(
self,
special=None,
min_freq=0,
max_size=None,
lower_case=False,
delimiter=None,
vocab_file=None,
pretrained_vocab_file=None,
never_split=None,
unk_token="<unk>",
eos_token="<eos>",
additional_special_tokens=["<formula>"],
add_eos=False,
add_double_eos=False,
normalization=None,
**kwargs
):
super().__init__(
_TransfoXLDelimiterLookupTokenizer(
vocab_file=vocab_file or pretrained_vocab_file,
delimiter=delimiter,
lowercase=lower_case,
unk_token=unk_token,
eos_token=eos_token,
add_eos=add_eos,
add_double_eos=add_double_eos,
normalization=normalization,
),
unk_token=unk_token,
eos_token=eos_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
warnings.warn(
"The class `TransfoXLTokenizerFast` is deprecated and will be removed in a future version. Please use `TransfoXLTokenizer` with it's enhanced tokenization instead.",
FutureWarning,
)
def save_pretrained(self, save_directory):
logger.warning(
"Please note you will not be able to load the vocabulary in"
" Python-based TransfoXLTokenizer as they don't share the same structure."
)
return super().save_pretrained(save_directory)
class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
"""
......
......@@ -15,7 +15,6 @@
""" Tokenization classes for python tokenizers.
For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py
"""
import itertools
import re
import unicodedata
......@@ -45,6 +44,11 @@ from .utils import logging
logger = logging.get_logger(__name__)
# Slow tokenizers are saved in a vocabulary plus three separate files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
def _is_whitespace(char):
"""Checks whether `char` is a whitespace character."""
......@@ -190,7 +194,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
tokens_to_add = []
for token in new_tokens:
assert isinstance(token, str)
if not special_tokens and self.init_kwargs.get("do_lower_case", False):
if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
token = token.lower()
if (
token != self.unk_token
......@@ -239,6 +243,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
"""
Converts a string in a sequence of tokens, using the tokenizer.
Note that, unlike Fast tokenizers (instances of PreTrainedTokenizerFast), this method
won't replace the unknown tokens with the `unk_token` yet (this is done in the `encode()` method)
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
Takes care of added tokens.
......@@ -268,7 +275,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
logger.warning(f"Keyword arguments {kwargs} not recognized.")
# TODO: should this be in the base class?
if self.init_kwargs.get("do_lower_case", False):
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
......@@ -740,7 +747,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
return " ".join(tokens)
def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
spaces_between_special_tokens: bool = True,
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary
......@@ -755,6 +766,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to clean up the tokenization spaces.
spaces_between_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add spaces around special tokens.
Fast tokenizers default this to :obj:`False`.
This is set to :obj:`True` in slow tokenizers for backward compatibility.
Returns:
:obj:`str`: The decoded sentence.
......@@ -778,7 +793,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
text = " ".join(sub_texts)
if spaces_between_special_tokens:
text = " ".join(sub_texts)
else:
text = "".join(sub_texts)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
......
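
An illustrative sketch (not part of the diff; the added token is a made-up example and the exact spacing of the outputs may differ) of the new spaces_between_special_tokens flag on slow tokenizers:

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.add_tokens(["<new_tok>"])  # hypothetical added token
ids = tok.encode("hello <new_tok> world")

# default True: sub-strings around added/special tokens are joined with spaces
print(tok.decode(ids, spaces_between_special_tokens=True))
# False: joined without extra spaces, matching the fast tokenizers' behaviour
print(tok.decode(ids, spaces_between_special_tokens=False))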
......@@ -646,6 +646,8 @@ class SpecialTokensMixin:
# which are not yet in the vocabulary. Necessary for serialization/de-serialization
# TODO clean this up at some point (probably by switching to fast tokenizers)
for key, value in kwargs.items():
if value is None:
continue
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
......@@ -778,6 +780,9 @@ class SpecialTokensMixin:
return self._add_tokens(new_tokens, special_tokens=special_tokens)
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
raise NotImplementedError
@property
def bos_token(self) -> str:
"""
......@@ -1293,11 +1298,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_model_input_sizes: Dict[str, Optional[int]] = {}
model_input_names: List[str] = ["token_type_ids", "attention_mask"]
padding_side: str = "right"
slow_tokenizer_class = None
def __init__(self, **kwargs):
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
self.init_inputs = ()
self.init_kwargs = kwargs
self.init_kwargs = copy.deepcopy(kwargs)
# For backward compatibility we fallback to set model_max_length from max_len if provided
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
......@@ -1311,6 +1317,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
self.deprecation_warnings = (
{}
) # Use to store when we have already noticed a deprecation warning (avoid overlogging).
super().__init__(**kwargs)
@property
......@@ -1343,9 +1353,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def max_len_single_sentence(self, value) -> int:
# For backward compatibility, allow to try to setup 'max_len_single_sentence'.
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
logger.warning(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
)
if not self.deprecation_warnings.get("max_len_single_sentence", False):
logger.warning(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
)
self.deprecation_warnings["max_len_single_sentence"] = True
else:
raise ValueError(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
......@@ -1355,16 +1367,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def max_len_sentences_pair(self, value) -> int:
# For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
logger.warning(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
if not self.deprecation_warnings.get("max_len_sentences_pair", False):
logger.warning(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
self.deprecation_warnings["max_len_sentences_pair"] = True
else:
raise ValueError(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
r"""
Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
a predefined tokenizer.
......@@ -1425,10 +1439,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
assert tokenizer.unk_token == '<unk>'
"""
return cls._from_pretrained(*inputs, **kwargs)
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
......@@ -1475,7 +1485,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"added_tokens_file": ADDED_TOKENS_FILE,
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
"full_tokenizer_file": FULL_TOKENIZER_FILE,
"tokenizer_file": FULL_TOKENIZER_FILE,
}
# Look for the tokenizer files
for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
......@@ -1541,6 +1551,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
else:
logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
return cls._from_pretrained(
resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
)
@classmethod
def _from_pretrained(
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
):
# We instantiate fast tokenizers based on a slow tokenizer for now
# In the future we can also use a direct way based on saving/instantiating
# tokenizer's Tokenizer directly from its serialization JSON
if cls.slow_tokenizer_class is not None:
slow_tokenizer = cls.slow_tokenizer_class._from_pretrained(
copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path,
copy.deepcopy(init_configuration),
*init_inputs,
**(copy.deepcopy(kwargs)),
)
else:
slow_tokenizer = None
# Prepare tokenizer initialization kwargs
# Did we saved some inputs and kwargs to reload ?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
......@@ -1556,6 +1588,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# Update with newly provided kwargs
init_kwargs.update(kwargs)
# Convert AddedTokens serialized as dict to class instances
def convert_added_tokens(obj: Union[AddedToken, Any]):
if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
obj.pop("__type")
return AddedToken(**obj)
elif isinstance(obj, (list, tuple)):
return list(convert_added_tokens(o) for o in obj)
elif isinstance(obj, dict):
return {k: convert_added_tokens(v) for k, v in obj.items()}
return obj
init_kwargs = convert_added_tokens(init_kwargs)
# Set max length if needed
if pretrained_model_name_or_path in cls.max_model_input_sizes:
# if we're using a pretrained model, ensure the tokenizer
......@@ -1570,6 +1615,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if slow_tokenizer is not None:
init_kwargs["__slow_tokenizer"] = slow_tokenizer
# Instantiate tokenizer.
try:
tokenizer = cls(*init_inputs, **init_kwargs)
......@@ -1580,8 +1628,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
)
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs
tokenizer.init_kwargs = init_kwargs
# Removed: Now done at the base class level
# tokenizer.init_inputs = init_inputs
# tokenizer.init_kwargs = init_kwargs
# If there is a complementary special token map, load it
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
......@@ -1589,11 +1638,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
special_tokens_map = json.load(special_tokens_map_handle)
special_tokens_map = convert_added_tokens(special_tokens_map)
for key, value in special_tokens_map.items():
if isinstance(value, dict):
value = AddedToken(**value)
elif isinstance(value, list):
value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
setattr(tokenizer, key, value)
# Add supplementary tokens.
......@@ -1623,14 +1669,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def save_pretrained(self, save_directory: str) -> Tuple[str]:
"""
Save the tokenizer vocabulary files together with:
Save the full tokenizer state.
- added tokens,
- special tokens to class attributes mapping,
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
This method makes sure the full tokenizer can then be re-loaded using the
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` class method.
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
.. Note::
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with
this method will not be possible to load back
in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` instance. It can only be loaded
in a "fast" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizerFast` instance.
.. Warning::
This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
......@@ -1648,7 +1697,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
os.makedirs(save_directory, exist_ok=True)
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
tokenizer_config = copy.deepcopy(self.init_kwargs)
......@@ -1657,22 +1705,33 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
# Sanitize AddedTokens
def convert_added_tokens(obj: Union[AddedToken, Any]):
if isinstance(obj, AddedToken):
out = obj.__getstate__()
out["__type"] = "AddedToken"
return out
elif isinstance(obj, (list, tuple)):
return list(convert_added_tokens(o) for o in obj)
elif isinstance(obj, dict):
return {k: convert_added_tokens(v) for k, v in obj.items()}
return obj
tokenizer_config = convert_added_tokens(tokenizer_config)
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
# Sanitize AddedTokens in special_tokens_map
write_dict = convert_added_tokens(self.special_tokens_map_extended)
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
write_dict = {}
for key, value in self.special_tokens_map_extended.items():
if isinstance(value, AddedToken):
write_dict[key] = value.__getstate__()
elif isinstance(value, list):
write_dict[key] = [
token.__getstate__() if isinstance(token, AddedToken) else token for token in value
]
else:
write_dict[key] = value
f.write(json.dumps(write_dict, ensure_ascii=False))
file_names = (tokenizer_config_file, special_tokens_map_file)
return self._save_pretrained(save_directory, file_names)
def _save_pretrained(self, save_directory: str, file_names: Tuple[str]) -> Tuple[str]:
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
added_vocab = self.get_added_vocab()
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
......@@ -1681,7 +1740,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
vocab_files = self.save_vocabulary(save_directory)
return vocab_files + (special_tokens_map_file, added_tokens_file)
return file_names + (vocab_files, added_tokens_file)
@add_end_docstrings(
ENCODE_KWARGS_DOCSTRING,
......@@ -1752,13 +1811,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# If you only set max_length, it activates truncation for max_length
if max_length is not None and padding is False and truncation is False:
if verbose:
logger.warning(
"Truncation was not explicitely activated but `max_length` is provided a specific value, "
"please use `truncation=True` to explicitely truncate examples to max length. "
"Defaulting to 'longest_first' truncation strategy. "
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
"more precisely by providing a specific strategy to `truncation`."
)
if not self.deprecation_warnings.get("Truncation-not-explicitely-activated", False):
logger.warning(
"Truncation was not explicitely activated but `max_length` is provided a specific value, "
"please use `truncation=True` to explicitely truncate examples to max length. "
"Defaulting to 'longest_first' truncation strategy. "
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
"more precisely by providing a specific strategy to `truncation`."
)
self.deprecation_warnings["Truncation-not-explicitely-activated"] = True
truncation = "longest_first"
# Get padding strategy
......@@ -1818,10 +1879,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if padding_strategy == PaddingStrategy.MAX_LENGTH:
if self.model_max_length > LARGE_INTEGER:
if verbose:
logger.warning(
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no padding."
)
if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
logger.warning(
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no padding."
)
self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
padding_strategy = PaddingStrategy.DO_NOT_PAD
else:
max_length = self.model_max_length
......@@ -1829,10 +1892,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
if self.model_max_length > LARGE_INTEGER:
if verbose:
logger.warning(
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no truncation."
)
if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
logger.warning(
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no truncation."
)
self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
else:
max_length = self.model_max_length
......@@ -2437,6 +2502,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
len_ids = len(ids)
len_pair_ids = len(pair_ids) if pair else 0
if return_token_type_ids is not None and not add_special_tokens:
raise ValueError(
"Asking to return token_type_ids while setting add_special_tokens to False "
"results in an undefined behavior. Please set add_special_tokens to True or "
"set return_token_type_ids to None."
)
# Load from model defaults
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
......@@ -2469,7 +2541,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
else:
sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
# Build output dictionary
encoded_inputs["input_ids"] = sequence
......@@ -2483,11 +2555,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# Check lengths
if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
logger.warning(
"Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length)
)
if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
logger.warning(
"Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length)
)
self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
# Padding
if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
......@@ -2703,7 +2777,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
]
def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary
......
......@@ -16,16 +16,19 @@
For slow (python) tokenizers see tokenization_utils.py
"""
import copy
import os
import warnings
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast
from .convert_slow_tokenizer import convert_slow_tokenizer
from .file_utils import add_end_docstrings
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
INIT_TOKENIZER_DOCSTRING,
AddedToken,
......@@ -44,6 +47,15 @@ from .utils import logging
logger = logging.get_logger(__name__)
# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
# Slow tokenizers have an additional added tokens file
ADDED_TOKENS_FILE = "added_tokens.json"
@add_end_docstrings(
INIT_TOKENIZER_DOCSTRING,
"""
......@@ -64,12 +76,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
dictionary structures (BPE, sentencepiece...).
"""
def __init__(self, tokenizer: BaseTokenizerFast, **kwargs):
if not isinstance(tokenizer, BaseTokenizerFast):
raise ValueError(
"Tokenizer should be an instance of a BaseTokenizer " "provided by HuggingFace tokenizers library."
)
self._tokenizer: BaseTokenizerFast = tokenizer
slow_tokenizer_class: PreTrainedTokenizer = None
def __init__(self, *args, **kwargs):
# We instantiate fast tokenizers based on a slow tokenizer for now
# In the future we'll also use a direct way based on saving/instantiating
# tokenizer's Tokenizer directly from its serialization JSON
if "__slow_tokenizer" in kwargs and kwargs["__slow_tokenizer"]:
slow_tokenizer = kwargs.pop("__slow_tokenizer")
else:
slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
self._tokenizer = convert_slow_tokenizer(slow_tokenizer)
kwargs = copy.deepcopy(slow_tokenizer.init_kwargs)
# We call this after having initialized the backend tokenizer because we update it.
super().__init__(**kwargs)
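
A rough sketch (not part of the diff; assumes sentencepiece and protobuf are installed) of what the constructor above does when no __slow_tokenizer is passed: instantiate the slow tokenizer, then convert it into a Rust-backed tokenizers.Tokenizer.

from transformers import T5Tokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

slow = T5Tokenizer.from_pretrained("t5-small")
backend = convert_slow_tokenizer(slow)  # a tokenizers.Tokenizer instance
print(type(backend))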
......@@ -116,7 +135,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return self._tokenizer.get_vocab_size(with_added_tokens=True)
@property
def backend_tokenizer(self) -> BaseTokenizerFast:
def backend_tokenizer(self) -> TokenizerFast:
"""
:obj:`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
"""
......@@ -259,6 +278,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
"""
Converts a string in a sequence of tokens, using the backend Rust tokenizer.
Note that, unlike slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method
will replace the unknown tokens with the :obj:`unk_token`.
Args:
text (:obj:`str`):
The sequence to be encoded.
......@@ -343,7 +365,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
) -> BatchEncoding:
if not isinstance(batch_text_or_text_pairs, list):
raise ValueError(
raise TypeError(
"batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs))
)
......@@ -487,7 +509,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return batched_output
def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
self,
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary
......@@ -496,7 +522,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
Args:
token_ids (:obj:`List[int]`):
token_ids (:obj:`Union[int, List[int]]`):
List of tokenized input ids. Can be obtained using the ``__call__`` method.
skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to remove special tokens in the decoding.
......@@ -506,6 +532,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
Returns:
:obj:`str`: The decoded sentence.
"""
if isinstance(token_ids, int):
token_ids = [token_ids]
text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
if clean_up_tokenization_spaces:
......@@ -520,8 +548,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
and special token mappings.
.. warning::
Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
Please use :meth:`~transformers.PreTrainedTokenizerFast.save_pretrained` to save the full tokenizer state if
you want to reload it using the :meth:`~transformers.PreTrainedTokenizerFast.from_pretrained` class method.
Args:
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
......@@ -530,7 +558,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
A tuple of :obj:`str`: The files saved.
"""
if os.path.isdir(save_directory):
files = self._tokenizer.save_model(save_directory)
files = self._tokenizer.model.save(save_directory)
else:
folder, file = os.path.split(os.path.abspath(save_directory))
files = self._tokenizer.save_model(folder, name=file)
......
......@@ -648,6 +648,10 @@ class XLMTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
@property
def do_lower_case(self):
return self.do_lowercase_and_remove_accent
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
......
......@@ -20,6 +20,7 @@ from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_xlnet import SPIECE_UNDERLINE
from .utils import logging
......@@ -307,3 +308,190 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
<https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = XLMRobertaTokenizer
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
**kwargs
):
super().__init__(
vocab_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
self.vocab_file = vocab_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
An XLM-RoBERTa sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
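# Illustrative sketch (assumed inputs): for token_ids_0 = [5, 6] and token_ids_1 = [7],
# the returned mask is [1, 0, 0, 1, 1, 0, 1] -- 1 marks the added <s>/</s> positions,
# 0 marks the sequence tokens.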
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
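# Illustrative sketch (assumed inputs): for token_ids_0 = [5, 6] and token_ids_1 = [7],
# the result is [0, 0, 0, 0, 0, 0, 0] -- one zero per position of ``<s> A </s></s> B </s>``.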
def save_vocabulary(self, save_directory):
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
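# A minimal usage sketch (assumes the public "xlm-roberta-base" checkpoint is reachable):
#
#     from transformers import XLMRobertaTokenizerFast
#     tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
#     ids = tokenizer.encode("Hello world")
#     # ids starts with cls_token_id and ends with sep_token_id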
......@@ -21,6 +21,7 @@ from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -344,3 +345,213 @@ class XLNetTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
class XLNetTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on
`SentencePiece <https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
padding_side = "left"
slow_tokenizer_class = XLNetTokenizer
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"],
**kwargs
):
super().__init__(
vocab_file=vocab_file,
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
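# Segment (token type) IDs used by XLNet: 0 for the first sequence, 1 for the second,
# 2 for the <cls> token (see create_token_type_ids_from_sequences below), and 3 for
# padded positions.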
self._pad_token_type_id = 3
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
An XLNet sequence has the following format:
- single sequence: ``X <sep> <cls>``
- pair of sequences: ``A <sep> B <sep> <cls>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls
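# Illustrative sketch (toy IDs, assumed for documentation only): if sep_token_id were 4
# and cls_token_id were 3, build_inputs_with_special_tokens([5, 6], [7]) would return
# [5, 6, 4, 7, 4, 3], i.e. ``A <sep> B <sep> <cls>`` -- the special tokens come last.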
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1]
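# Illustrative sketch (assumed inputs): for token_ids_0 = [5, 6] and token_ids_1 = [7],
# the returned mask is [0, 0, 1, 0, 1, 1] -- zeros for sequence tokens, ones for the
# <sep> tokens and the trailing <cls>.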
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLNet sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2
| first sequence    | second sequence   |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s) followed by
the :obj:`cls_token` segment ID (2).
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls_segment_id = [2]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
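# Illustrative sketch (assumed inputs): for token_ids_0 = [5, 6] and token_ids_1 = [7],
# the returned type IDs are [0, 0, 0, 1, 1, 2] -- 0 for ``A <sep>``, 1 for ``B <sep>``,
# 2 for the final <cls>.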
def save_vocabulary(self, save_directory):
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
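# A minimal usage sketch (assumes the public "xlnet-base-cased" checkpoint is reachable):
#
#     from transformers import XLNetTokenizerFast
#     tokenizer = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")
#     batch = tokenizer(["short", "a longer sentence"], padding=True)
#     # Because padding_side is "left", pad tokens are prepended to the shorter input.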
......@@ -17,7 +17,7 @@
import os
import unittest
from transformers.tokenization_albert import AlbertTokenizer
from transformers.tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin
......@@ -28,6 +28,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = AlbertTokenizer
rust_tokenizer_class = AlbertTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
......@@ -41,6 +43,28 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
output_text = "this is a test"
return input_text, output_text
def test_rust_and_python_full_tokenizers(self):
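# Check that the slow (Python) tokenizer and the fast (Rust-backed) tokenizer agree
# on tokenization and on encoding, both without and with special tokens.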
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
def test_full_tokenizer(self):
tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
......
......@@ -12,6 +12,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BartTokenizer
rust_tokenizer_class = BartTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
......
......@@ -35,7 +35,9 @@ from .test_tokenization_common import TokenizerTesterMixin
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertTokenizer
rust_tokenizer_class = BertTokenizerFast
test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self):
super().setUp()
......@@ -61,9 +63,6 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_rust_tokenizer(self, **kwargs):
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00E9d,running"
output_text = "unwanted, running"
......
......@@ -15,6 +15,7 @@
import os
import pickle
import unittest
from transformers.testing_utils import custom_tokenizers
......@@ -33,6 +34,7 @@ from .test_tokenization_common import TokenizerTesterMixin
class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertJapaneseTokenizer
space_between_special_tokens = True
def setUp(self):
super().setUp()
......@@ -87,6 +89,26 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
def test_pickle_mecab_tokenizer(self):
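# The MeCab-backed tokenizer must produce the same tokens before and after a pickle
# round trip.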
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
self.assertIsNotNone(tokenizer)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)
tokens_loaded = tokenizer_new.tokenize(text)
self.assertListEqual(tokens, tokens_loaded)
def test_mecab_tokenizer_ipadic(self):
tokenizer = MecabTokenizer(mecab_dic="ipadic")
......