Unverified Commit 9aeacb58, authored by Thomas Wolf and committed by GitHub

Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)

* [WIP] SP tokenizers

* fixing tests for T5

* WIP tokenizers

* serialization

* update T5

* WIP T5 tokenization

* slow to fast conversion script

* Refactoring to move tokenizer implementations inside transformers

* Adding gpt - refactoring - quality

* WIP adding several tokenizers to the fast world

* WIP Roberta - moving implementations

* update to dev4 switch file loading to in-memory loading

* Updating and fixing

* advancing on the tokenizers - updating do_lower_case

* style and quality

* moving forward with tokenizers conversion and tests

* MBart, T5

* dropping the fast version of Transformer-XL

* Adding to autotokenizers + style/quality

* update init and space_between_special_tokens

* style and quality

* bump up tokenizers version

* add protobuf

* fix pickle Bert JP with Mecab

* fix newly added tokenizers

* style and quality

* fix bert japanese

* fix funnel

* limit tokenizer warning to one occurrence

* clean up file

* fix new tokenizers

* fast tokenizers deep tests

* WIP adding all the special fast tests on the new fast tokenizers

* quick fix

* adding more fast tokenizers in the fast tests

* all tokenizers in fast version tested

* Adding BertGenerationFast

* bump up setup.py for CI

* remove BertGenerationFast (too early)

* bump up tokenizers version

* Clean old docstrings

* Typo

* Update following Lysandre comments
Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
parent 4d04120c
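
For context before the diff below, here is a minimal usage sketch (not part of this commit) of the fast, Rust-backed SentencePiece tokenizers this PR introduces; the checkpoint name and the use_fast flag are illustrative assumptions, not taken from the diff:

from transformers import AutoTokenizer, T5TokenizerFast

# Instantiate the Rust-backed tokenizer class directly...
tok_fast = T5TokenizerFast.from_pretrained("t5-small")

# ...or opt in through AutoTokenizer (use_fast selects the fast implementation).
tok_auto = AutoTokenizer.from_pretrained("t5-small", use_fast=True)

encoding = tok_fast("Studies have shown that owning a dog is good for you.")
print(tok_fast.is_fast)        # True for the Rust-backed tokenizer
print(encoding.input_ids[:5])  # token ids, expected to match the slow tokenizer's output
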
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest

from transformers.testing_utils import _torch_available
from transformers.tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin


SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")

FRAMEWORK = "pt" if _torch_available else "tf"


class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = CamembertTokenizer
    rust_tokenizer_class = CamembertTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "I was born in 92000, and this is falsé."

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)
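
The test above is the pattern this PR applies everywhere: encode the same text with the slow (Python) and fast (Rust) tokenizer and require identical ids. A standalone sketch of that check against a published checkpoint (the model name is illustrative, not part of this diff):

from transformers import CamembertTokenizer, CamembertTokenizerFast

slow = CamembertTokenizer.from_pretrained("camembert-base")
fast = CamembertTokenizerFast.from_pretrained("camembert-base")

text = "I was born in 92000, and this is falsé."
# Ids should match with and without the special tokens added around the sequence.
assert slow.encode(text, add_special_tokens=False) == fast.encode(text, add_special_tokens=False)
assert slow.encode(text) == fast.encode(text)
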
@@ -56,7 +56,9 @@ def merge_model_tokenizer_mappings(

class TokenizerTesterMixin:

    tokenizer_class = None
+    rust_tokenizer_class = None
    test_rust_tokenizer = False
+    space_between_special_tokens = False

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()

@@ -68,12 +70,15 @@ class TokenizerTesterMixin:
        input_txt = self.get_clean_sequence(tokenizer)[0]
        return input_txt, input_txt

-    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20) -> Tuple[str, list]:
+    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
        toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
        toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
        toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
        if max_length is not None and len(toks) > max_length:
            toks = toks[:max_length]
+        if min_length is not None and len(toks) < min_length and len(toks) > 0:
+            while len(toks) < min_length:
+                toks = toks + toks
        # toks_str = [t[1] for t in toks]
        toks_ids = [t[0] for t in toks]

@@ -99,7 +104,7 @@ class TokenizerTesterMixin:
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
-        raise NotImplementedError
+        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    # def get_input_output_texts(self) -> Tuple[str, str]:
    #     """Feel free to overwrite"""

@@ -118,6 +123,29 @@ class TokenizerTesterMixin:
            for i in range(len(batch_encode_plus_sequences["input_ids"]))
        ]

+    def test_rust_and_python_full_tokenizers(self):
+        if not self.test_rust_tokenizer:
+            return
+
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        sequence, _ = self.get_input_output_texts(tokenizer)
+
+        # We don't have an exact equivalence on `tokenize()` between Rust and Slow
+        # Slow tokenizer only split tokens, Rust tokenizers will replace with <unk>
+        # tokens = tokenizer.tokenize(sequence)
+        # rust_tokens = rust_tokenizer.tokenize(sequence)
+        # self.assertListEqual(tokens, rust_tokens)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=False)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
+        self.assertListEqual(ids, rust_ids)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=True)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=True)
+        self.assertListEqual(ids, rust_ids)
+
    def test_tokenizers_common_properties(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:

@@ -241,6 +269,9 @@ class TokenizerTesterMixin:
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
+                    continue
+
                special_token = tokenizer.all_special_tokens[0]
                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token

@@ -272,6 +303,9 @@ class TokenizerTesterMixin:
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
+                    continue
+
                special_token = tokenizer.all_special_tokens[0]
                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token

@@ -282,7 +316,7 @@ class TokenizerTesterMixin:
                toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

                added = tokenizer.add_tokens(new_toks)
-                self.assertEqual(added, 4)
+                self.assertIn(added, [2, 4])

                toks = tokenizer.tokenize(text)
                toks2 = tokenizer.tokenize(text2)

@@ -390,12 +424,17 @@ class TokenizerTesterMixin:
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                new_toks = ["[ABC]", "[DEF]"]  # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
+                # new_toks = ["[ABC]", "[DEF]"]  # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
+                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
                tokenizer.add_tokens(new_toks)
-                input = "[ABC] [DEF] [ABC] [DEF]"  # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+                input = "[ABC][DEF][ABC][DEF]"  # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+                if self.space_between_special_tokens:
+                    output = "[ABC] [DEF] [ABC] [DEF]"
+                else:
+                    output = input
                encoded = tokenizer.encode(input, add_special_tokens=False)
-                decoded = tokenizer.decode(encoded)
-                self.assertEqual(decoded, input)
+                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
+                self.assertIn(decoded, [output, output.lower()])

    def test_pretrained_model_lists(self):
        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())

@@ -447,7 +486,7 @@ class TokenizerTesterMixin:
                sequence = tokenizer.encode(seq_0, add_special_tokens=False)
                total_length = len(sequence)

-                assert total_length > 1, "Issue with the testing sequence, please update it it's too short"
+                assert total_length > 4, "Issue with the testing sequence, please update it it's too short"

                # Test with max model input length
                model_max_length = tokenizer.model_max_length

@@ -546,6 +585,7 @@ class TokenizerTesterMixin:
                model_max_length = tokenizer.model_max_length
                self.assertEqual(model_max_length, 100)
                seq_2 = seq_0 * model_max_length
+                assert len(seq_2) > model_max_length

                sequence1 = tokenizer(seq_1, add_special_tokens=False)
                total_length1 = len(sequence1["input_ids"])

@@ -559,9 +599,9 @@ class TokenizerTesterMixin:
                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
                )
                for padding_state in padding_strategies:
-                    with self.subTest(f"Padding: {padding_state}"):
+                    with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"):
                        for truncation_state in [True, "longest_first", "only_first"]:
-                            with self.subTest(f"Truncation: {truncation_state}"):
+                            with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"):
                                output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"]), model_max_length)

@@ -748,34 +788,47 @@ class TokenizerTesterMixin:
        #         # This is not supported with the Rust tokenizers
        #         # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)

-    def test_swap_special_token(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                mask = "<mask>"
-                sequence = "Encode this sequence"
-                sequence_masked_0 = "Encode <mask> sequence"
-                sequence_masked_1 = "<mask> this sequence"
-
-                # Add tokens so that masked token isn't split
-                tokenizer.add_tokens(sequence.split())
-                tokenizer.add_special_tokens({"mask_token": mask})
-                mask_ind = tokenizer.convert_tokens_to_ids(mask)
-                encoded = tokenizer.encode(sequence, add_special_tokens=False)
-
-                # Test first masked sequence
-                encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
-                mask_loc = encoded_masked.index(mask_ind)
-                encoded_masked[mask_loc] = encoded[mask_loc]
-
-                self.assertEqual(encoded_masked, encoded)
-
-                # Test second masked sequence
-                encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
-                mask_loc = encoded_masked.index(mask_ind)
-                encoded_masked[mask_loc] = encoded[mask_loc]
-
-                self.assertEqual(encoded_masked, encoded)
+    # def test_swap_special_token(self):
+    #     tokenizers = self.get_tokenizers(do_lower_case=False)
+    #     for tokenizer in tokenizers:
+    #         with self.subTest(f"{tokenizer.__class__.__name__}"):
+    #             # Our mask token
+    #             mask = "<mask>"
+    #             # We take a single word in the middle of the vocabulary
+    #             all_tokens = sorted(tokenizer.get_vocab().keys())
+    #             word = tokenizer.decode(tokenizer.encode(all_tokens[len(all_tokens)//2], add_special_tokens=False)[:1])

+    #             sequence_0 = "Encode " + word + " sequence"
+    #             sequence_masked_0 = "Encode " + mask + " sequence"

+    #             sequence_1 = word + " this sequence"
+    #             sequence_masked_1 = mask + " this sequence"

+    #             # Add tokens so that masked token isn't split
+    #             # tokens = [AddedToken(t, lstrip=True, normalized=False) for t in sequence.split()]
+    #             # tokenizer.add_tokens(tokens)
+    #             tokenizer.add_special_tokens(
+    #                 {"mask_token": AddedToken(mask, normalized=False)}
+    #             )  # Eat left space on Byte-level BPE tokenizers
+    #             mask_ind = tokenizer.convert_tokens_to_ids(mask)

+    #             # Test first masked sequence
+    #             encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False)
+    #             encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
+    #             assert len(encoded_masked) == len(encoded_0)
+    #             mask_loc = encoded_masked.index(mask_ind)
+    #             encoded_masked[mask_loc] = encoded_0[mask_loc]

+    #             self.assertEqual(encoded_masked, encoded_0)

+    #             # Test second masked sequence
+    #             encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False)
+    #             encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
+    #             assert len(encoded_masked) == len(encoded_1)
+    #             mask_loc = encoded_masked.index(mask_ind)
+    #             encoded_masked[mask_loc] = encoded_1[mask_loc]

+    #             self.assertEqual(encoded_masked, encoded_1)

    def test_special_tokens_mask(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)

@@ -919,10 +972,10 @@ class TokenizerTesterMixin:
    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
-            if tokenizer.pad_token is None:
-                self.skipTest("No padding token.")
-            else:
-                with self.subTest(f"{tokenizer.__class__.__name__}"):
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                if tokenizer.pad_token is None:
+                    self.skipTest("No padding token.")
+                else:
                    empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
                    for key, value in empty_tokens.items():

@@ -1063,14 +1116,15 @@ class TokenizerTesterMixin:
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                vocab = tokenizer.get_vocab()
-                self.assertIsInstance(vocab, dict)
+                vocab_dict = tokenizer.get_vocab()
+                self.assertIsInstance(vocab_dict, dict)
+                self.assertGreaterEqual(len(tokenizer), len(vocab_dict))
+
+                vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
                self.assertEqual(len(vocab), len(tokenizer))

                tokenizer.add_tokens(["asdfasdfasdfasdf"])
-                vocab = tokenizer.get_vocab()
-                self.assertIsInstance(vocab, dict)
+                vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
                self.assertEqual(len(vocab), len(tokenizer))

    def test_conversion_reversible(self):

@@ -1079,6 +1133,8 @@ class TokenizerTesterMixin:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab = tokenizer.get_vocab()
                for word, ind in vocab.items():
+                    if word == tokenizer.unk_token:
+                        continue
                    self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
                    self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)

@@ -1173,12 +1229,13 @@ class TokenizerTesterMixin:
    def test_added_token_serializable(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
-            new_token = AddedToken("new_token", lstrip=True)
-            tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
-
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                tokenizer.save_pretrained(tmp_dir_name)
-                tokenizer.from_pretrained(tmp_dir_name)
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                new_token = AddedToken("new_token", lstrip=True)
+                tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
+
+                with tempfile.TemporaryDirectory() as tmp_dir_name:
+                    tokenizer.save_pretrained(tmp_dir_name)
+                    tokenizer.from_pretrained(tmp_dir_name)

    def test_batch_encode_plus_padding(self):
        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus

@@ -1243,6 +1300,9 @@ class TokenizerTesterMixin:
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space:
+                    continue
+
                # Prepare a sequence from our tokenizer vocabulary
                sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20)
                # sequence = " " + sequence  # To be sure the byte-level tokenizers are feeling good

@@ -1345,12 +1405,14 @@ class TokenizerTesterMixin:
    def test_prepare_for_model(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
-            string_sequence = "Testing the prepare_for_model method."
-            ids = tokenizer.encode(string_sequence, add_special_tokens=False)
-            input_dict = tokenizer.encode_plus(string_sequence)
-            prepared_input_dict = tokenizer.prepare_for_model(ids)
-
-            self.assertEqual(input_dict, prepared_input_dict)
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                string_sequence = "Testing the prepare_for_model method."
+                ids = tokenizer.encode(string_sequence, add_special_tokens=False)
+                prepared_input_dict = tokenizer.prepare_for_model(ids, add_special_tokens=True)
+
+                input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True)
+
+                self.assertEqual(input_dict, prepared_input_dict)

    def test_batch_encode_plus_overflowing_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
...
@@ -25,6 +25,7 @@ from .test_tokenization_common import TokenizerTesterMixin

class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = CTRLTokenizer
+    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
...
@@ -23,9 +23,8 @@ from .test_tokenization_bert import BertTokenizationTest

class DistilBertTokenizationTest(BertTokenizationTest):

    tokenizer_class = DistilBertTokenizer
+    rust_tokenizer_class = DistilBertTokenizerFast
+    test_rust_tokenizer = True

-    def get_rust_tokenizer(self, **kwargs):
-        return DistilBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    @slow
    def test_sequence_builders(self):
...
@@ -32,25 +32,22 @@ from .test_tokenization_bert import BertTokenizationTest

class DPRContextEncoderTokenizationTest(BertTokenizationTest):

    tokenizer_class = DPRContextEncoderTokenizer
+    rust_tokenizer_class = DPRContextEncoderTokenizerFast
+    test_rust_tokenizer = True

-    def get_rust_tokenizer(self, **kwargs):
-        return DPRContextEncoderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)


class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):

    tokenizer_class = DPRQuestionEncoderTokenizer
+    rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
+    test_rust_tokenizer = True

-    def get_rust_tokenizer(self, **kwargs):
-        return DPRQuestionEncoderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)


class DPRReaderTokenizationTest(BertTokenizationTest):

    tokenizer_class = DPRReaderTokenizer
+    rust_tokenizer_class = DPRReaderTokenizerFast
+    test_rust_tokenizer = True

-    def get_rust_tokenizer(self, **kwargs):
-        return DPRReaderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    @slow
    def test_decode_best_spans(self):
...
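
The three small diffs above all follow the same pattern: a model-specific test class declares its slow and fast tokenizer classes and opts into the shared slow/fast parity tests by setting test_rust_tokenizer = True. A sketch of that shape (the class name is illustrative, not part of this diff):

import unittest

from transformers import DistilBertTokenizer, DistilBertTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin


class ExampleTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    # The mixin picks these up in get_tokenizer() / get_rust_tokenizer()
    # and runs the common tests against both implementations.
    tokenizer_class = DistilBertTokenizer
    rust_tokenizer_class = DistilBertTokenizerFast
    test_rust_tokenizer = True
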
import logging
+import shutil
+import tempfile
import unittest
from collections import namedtuple
from itertools import takewhile

from transformers import (
+    AlbertTokenizer,
+    AlbertTokenizerFast,
+    BartTokenizer,
+    BartTokenizerFast,
    BertTokenizer,
    BertTokenizerFast,
+    CamembertTokenizer,
+    CamembertTokenizerFast,
    DistilBertTokenizer,
+    DistilBertTokenizerFast,
+    DPRContextEncoderTokenizer,
+    DPRContextEncoderTokenizerFast,
+    DPRQuestionEncoderTokenizer,
+    DPRQuestionEncoderTokenizerFast,
+    DPRReaderTokenizer,
+    DPRReaderTokenizerFast,
+    FunnelTokenizer,
+    FunnelTokenizerFast,
    GPT2Tokenizer,
    GPT2TokenizerFast,
+    LxmertTokenizer,
+    LxmertTokenizerFast,
+    MBartTokenizer,
+    MBartTokenizerFast,
    OpenAIGPTTokenizer,
-    PreTrainedTokenizer,
+    OpenAIGPTTokenizerFast,
+    PegasusTokenizer,
+    PegasusTokenizerFast,
+    ReformerTokenizer,
+    ReformerTokenizerFast,
    RobertaTokenizer,
+    RobertaTokenizerFast,
+    T5Tokenizer,
+    T5TokenizerFast,
+    XLMRobertaTokenizer,
+    XLMRobertaTokenizerFast,
+    XLNetTokenizer,
+    XLNetTokenizerFast,
    is_torch_available,
)
from transformers.testing_utils import get_tests_dir
-from transformers.tokenization_distilbert import DistilBertTokenizerFast
-from transformers.tokenization_openai import OpenAIGPTTokenizerFast
-from transformers.tokenization_roberta import RobertaTokenizerFast


logger = logging.getLogger(__name__)
@@ -40,245 +69,261 @@ class CommonFastTokenizerTest(unittest.TestCase):
TOKENIZERS_CLASSES = frozenset([]) TOKENIZERS_CLASSES = frozenset([])
def setUp(self) -> None: def setUp(self) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name)
self.tokenizers_list = [
(tok_case, pretrained_name, dict(t for t in tok_case.kwargs) if tok_case.kwargs else {})
for tok_case in self.TOKENIZERS_CLASSES
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys()
if tok_case.filter is None or (tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name))
]
with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip() self._data = f_data.read().replace("\n\n", "\n").strip()
def test_all_tokenizers(self): self.tmpdirname = tempfile.mkdtemp()
for tok_case in self.TOKENIZERS_CLASSES:
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys(): def tearDown(self):
shutil.rmtree(self.tmpdirname)
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name) def test_is_fast(self):
if tok_case.filter is None or ( for tok_case, pretrained_name, kwargs in self.tokenizers_list:
tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
): tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
kwargs = dict(t for t in tok_case.kwargs) if tok_case.kwargs else {} tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) # Check is_fast is set correctly
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) self.assertFalse(tokenizer_p.is_fast)
self.assertTrue(tokenizer_r.is_fast)
self.fast_align_python(tokenizer_r, tokenizer_p, tok_case, pretrained_name)
self.fast_only(tokenizer_r) def test_fast_only_inputs(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
def test_pretokenized_tokenizers(self): with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
for tok_case in self.TOKENIZERS_CLASSES: tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys():
# Ensure None raise an error
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the self.assertRaises(TypeError, tokenizer_r.tokenize, None)
# information available in Tokenizer (name, rust class, python class, vocab key name) self.assertRaises(TypeError, tokenizer_r.encode, None)
if tok_case.filter is None or ( self.assertRaises(TypeError, tokenizer_r.encode_plus, None)
tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name) self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None)
):
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): def test_alignement_methods(self):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, add_prefix_space=True) for tok_case, pretrained_name, kwargs in self.tokenizers_list:
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, add_prefix_space=True) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
self.assert_pretokenized_inputs(tokenizer_r, tokenizer_p)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
def fast_align_python(self, tokenizer_r, tokenizer_p, tok_case, pretrained_name): text = " ".join(words)
# Check is_fast is set correctly batch_size = 3
self.assertFalse(tokenizer_p.is_fast)
self.assertTrue(tokenizer_r.is_fast) encoding = tokenizer_r.encode_plus(text, add_special_tokens=False)
# Check that Rust and Python align batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False)
self.assert_tokenization_python_rust_equals(tokenizer_r, tokenizer_p) num_tokens = len(encoding["input_ids"])
self.assert_num_special_tokens_to_add_equal(tokenizer_r, tokenizer_p)
self.assert_max_length_equal(tokenizer_r, tokenizer_p) last_word_index = len(words) - 1
self.assert_special_tokens_map_equal(tokenizer_r, tokenizer_p) last_token_index = num_tokens - 1
self.assert_embeded_special_tokens(tokenizer_r, tokenizer_p) last_batch_index = batch_size - 1
self.assert_padding(tokenizer_r, tokenizer_p) last_char_index = len(text) - 1
self.assert_create_token_type_ids(tokenizer_r, tokenizer_p)
self.assert_prepare_for_model(tokenizer_r, tokenizer_p) # words, tokens
self.assertEqual(len(encoding.words(0)), num_tokens)
def fast_only(self, tokenizer_r): self.assertEqual(max(encoding.words(0)), last_word_index)
# Ensure None raise an error self.assertEqual(min(encoding.words(0)), 0)
self.assertRaises(ValueError, tokenizer_r.tokenize, None) self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens)
self.assertRaises(ValueError, tokenizer_r.encode, None) self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index)
self.assertRaises(ValueError, tokenizer_r.encode_plus, None) self.assertEqual(min(batch_encoding.words(last_batch_index)), 0)
self.assertRaises(ValueError, tokenizer_r.batch_encode_plus, None) self.assertEqual(len(encoding.tokens(0)), num_tokens)
self.assert_add_tokens(tokenizer_r) # Assert token_to_word
self.assert_offsets_mapping(tokenizer_r) self.assertEqual(encoding.token_to_word(0), 0)
self.assert_add_special_tokens(tokenizer_r) self.assertEqual(encoding.token_to_word(0, 0), 0)
self.assert_alignement_methods(tokenizer_r) self.assertEqual(encoding.token_to_word(last_token_index), last_word_index)
self.assert_batch_encode_dynamic_overflowing(tokenizer_r) self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index)
self.assertEqual(batch_encoding.token_to_word(1, 0), 0)
def assert_alignement_methods(self, tokenizer_r): self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index)
text = " ".join(words)
batch_size = 3 # Assert word_to_tokens
self.assertEqual(encoding.word_to_tokens(0).start, 0)
encoding = tokenizer_r.encode_plus(text, add_special_tokens=False) self.assertEqual(encoding.word_to_tokens(0, 0).start, 0)
self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1)
batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False) self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
num_tokens = len(encoding["input_ids"]) self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0)
self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
last_word_index = len(words) - 1 self.assertEqual(
last_token_index = num_tokens - 1 batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1
last_batch_index = batch_size - 1 )
last_char_index = len(text) - 1
# Assert token_to_chars
# words, tokens self.assertEqual(encoding.token_to_chars(0).start, 0)
self.assertEqual(len(encoding.words(0)), num_tokens) self.assertEqual(encoding.token_to_chars(0, 0).start, 0)
self.assertEqual(max(encoding.words(0)), last_word_index) self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1)
self.assertEqual(min(encoding.words(0)), 0) self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens) self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0)
self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index) self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(min(batch_encoding.words(last_batch_index)), 0) self.assertEqual(
self.assertEqual(len(encoding.tokens(0)), num_tokens) batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1
)
# Assert token_to_word
self.assertEqual(encoding.token_to_word(0), 0) # Assert char_to_token
self.assertEqual(encoding.token_to_word(0, 0), 0) self.assertEqual(encoding.char_to_token(0), 0)
self.assertEqual(encoding.token_to_word(last_token_index), last_word_index) self.assertEqual(encoding.char_to_token(0, 0), 0)
self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index) self.assertEqual(encoding.char_to_token(last_char_index), last_token_index)
self.assertEqual(batch_encoding.token_to_word(1, 0), 0) self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index) self.assertEqual(batch_encoding.char_to_token(1, 0), 0)
self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index) self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index)
# Assert word_to_tokens
self.assertEqual(encoding.word_to_tokens(0).start, 0) # Assert char_to_word
self.assertEqual(encoding.word_to_tokens(0, 0).start, 0) self.assertEqual(encoding.char_to_word(0), 0)
self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1) self.assertEqual(encoding.char_to_word(0, 0), 0)
self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) self.assertEqual(encoding.char_to_word(last_char_index), last_word_index)
self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0) self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) self.assertEqual(batch_encoding.char_to_word(1, 0), 0)
self.assertEqual(batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1) self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index)
# Assert token_to_chars
self.assertEqual(encoding.token_to_chars(0).start, 0) # Assert word_to_chars
self.assertEqual(encoding.token_to_chars(0, 0).start, 0) self.assertEqual(encoding.word_to_chars(0).start, 0)
self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1) self.assertEqual(encoding.word_to_chars(0, 0).start, 0)
self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0) self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0)
self.assertEqual(batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1) self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(
# Assert char_to_token batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1
self.assertEqual(encoding.char_to_token(0), 0) )
self.assertEqual(encoding.char_to_token(0, 0), 0)
self.assertEqual(encoding.char_to_token(last_char_index), last_token_index) def test_tokenization_python_rust_equals(self):
self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index) for tok_case, pretrained_name, kwargs in self.tokenizers_list:
self.assertEqual(batch_encoding.char_to_token(1, 0), 0) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index) tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Assert char_to_word # Ensure basic input match
self.assertEqual(encoding.char_to_word(0), 0) input_p = tokenizer_p.encode_plus(self._data)
self.assertEqual(encoding.char_to_word(0, 0), 0) input_r = tokenizer_r.encode_plus(self._data)
self.assertEqual(encoding.char_to_word(last_char_index), last_word_index)
self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index) for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertEqual(batch_encoding.char_to_word(1, 0), 0) self.assertSequenceEqual(input_p[key], input_r[key])
self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index) input_pairs_p = tokenizer_p.encode_plus(self._data, self._data)
input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)
# Assert word_to_chars
self.assertEqual(encoding.word_to_chars(0).start, 0) for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertEqual(encoding.word_to_chars(0, 0).start, 0) self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1)
self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) # Ensure truncation match
self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0) input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True)
self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True)
self.assertEqual(batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
def assert_tokenization_python_rust_equals(self, tokenizer_r, tokenizer_p): self.assertSequenceEqual(input_p[key], input_r[key])
# Ensure basic input match
input_p = tokenizer_p.encode_plus(self._data) # Ensure truncation with stride match
input_r = tokenizer_r.encode_plus(self._data) input_p = tokenizer_p.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): )
self.assertSequenceEqual(input_p[key], input_r[key]) input_r = tokenizer_r.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
input_pairs_p = tokenizer_p.encode_plus(self._data, self._data) )
input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): self.assertSequenceEqual(input_p[key], input_r[key][0])
self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
def test_num_special_tokens_to_add_equal(self):
# Ensure truncation match for tok_case, pretrained_name, kwargs in self.tokenizers_list:
input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key]) # Check we have the same number of added_tokens for both pair and non-pair inputs.
self.assertEqual(
# Ensure truncation with stride match tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)
input_p = tokenizer_p.encode_plus( )
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True self.assertEqual(
) tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)
input_r = tokenizer_r.encode_plus( )
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
) def test_max_length_equal(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
self.assertSequenceEqual(input_p[key], input_r[key][0]) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
def assert_num_special_tokens_to_add_equal(self, tokenizer_r, tokenizer_p):
# Check we have the same number of added_tokens for both pair and non-pair inputs. # Check we have the correct max_length for both pair and non-pair inputs.
self.assertEqual(tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)) self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
self.assertEqual(tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)) self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)
def assert_max_length_equal(self, tokenizer_r, tokenizer_p): def test_special_tokens_map_equal(self):
# Check we have the correct max_length for both pair and non-pair inputs. for tok_case, pretrained_name, kwargs in self.tokenizers_list:
self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
def assert_special_tokens_map_equal(self, tokenizer_r, tokenizer_p):
# Assert the set of special tokens match. # Assert the set of special tokens match.
self.assertSequenceEqual( self.assertSequenceEqual(
tokenizer_p.special_tokens_map.items(), tokenizer_p.special_tokens_map.items(),
tokenizer_r.special_tokens_map.items(), tokenizer_r.special_tokens_map.items(),
) )
def assert_add_tokens(self, tokenizer_r): def test_add_tokens(self):
vocab_size = tokenizer_r.vocab_size for tok_case, pretrained_name, kwargs in self.tokenizers_list:
self.assertEqual(tokenizer_r.add_tokens(""), 0) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
self.assertEqual(tokenizer_r.add_tokens("testoken"), 1) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
self.assertEqual(len(tokenizer_r), vocab_size + 3) vocab_size = len(tokenizer_r)
self.assertEqual(tokenizer_r.add_tokens(""), 0)
self.assertEqual(tokenizer_r.add_special_tokens({}), 0) self.assertEqual(tokenizer_r.add_tokens("testoken"), 1)
self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2) self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
self.assertRaises( self.assertEqual(len(tokenizer_r), vocab_size + 3)
AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
) self.assertEqual(tokenizer_r.add_special_tokens({}), 0)
self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1) self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
self.assertEqual( self.assertRaises(
tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2 AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
) )
self.assertEqual(len(tokenizer_r), vocab_size + 8) self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
self.assertEqual(
def assert_offsets_mapping(self, tokenizer_r): tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
text = "Wonderful no inspiration example with subtoken" )
pair = "Along with an awesome pair" self.assertEqual(len(tokenizer_r), vocab_size + 8)
# No pair def test_offsets_mapping(self):
tokens_with_offsets = tokenizer_r.encode_plus( for tok_case, pretrained_name, kwargs in self.tokenizers_list:
text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
added_tokens = tokenizer_r.num_special_tokens_to_add(False)
offsets = tokens_with_offsets["offset_mapping"] text = "Wonderful no inspiration example with subtoken"
pair = "Along with an awesome pair"
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) # No pair
tokens_with_offsets = tokenizer_r.encode_plus(
# Assert there is online added_tokens special_tokens text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) )
added_tokens = tokenizer_r.num_special_tokens_to_add(False)
# Pairs offsets = tokens_with_offsets["offset_mapping"]
tokens_with_offsets = tokenizer_r.encode_plus(
text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True # Assert there is the same number of tokens and offsets
) self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
added_tokens = tokenizer_r.num_special_tokens_to_add(True)
offsets = tokens_with_offsets["offset_mapping"] # Assert there is online added_tokens special_tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) # Pairs
tokens_with_offsets = tokenizer_r.encode_plus(
# Assert there is online added_tokens special_tokens text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) )
added_tokens = tokenizer_r.num_special_tokens_to_add(True)
def assert_batch_encode_dynamic_overflowing(self, tokenizer: PreTrainedTokenizer): offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
# Assert there is online added_tokens special_tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
def test_batch_encode_dynamic_overflowing(self):
""" """
When calling batch_encode with multiple sequence it can returns different number of When calling batch_encode with multiple sequence it can returns different number of
overflowing encoding for each sequence: overflowing encoding for each sequence:
...@@ -289,437 +334,515 @@ class CommonFastTokenizerTest(unittest.TestCase): ...@@ -289,437 +334,515 @@ class CommonFastTokenizerTest(unittest.TestCase):
] ]
This needs to be padded so that it can represented as a tensor This needs to be padded so that it can represented as a tensor
""" """
returned_tensor = "pt" if is_torch_available() else "tf" for tok_case, pretrained_name, kwargs in self.tokenizers_list:
tokenizer = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
return with self.subTest("{} ({}, {})".format(tok_case.name, pretrained_name, tokenizer.__class__.__name__)):
tokens = tokenizer.encode_plus( returned_tensor = "pt" if is_torch_available() else "tf"
"HuggingFace is solving NLP one commit at a time",
max_length=6, if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
padding=True, return
truncation=True,
return_tensors=returned_tensor, tokens = tokenizer.encode_plus(
return_overflowing_tokens=True, "HuggingFace is solving NLP one commit at a time",
) max_length=6,
padding=True,
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): truncation=True,
self.assertEqual(len(tokens[key].shape), 2) return_tensors=returned_tensor,
return_overflowing_tokens=True,
# Mono sample )
tokens = tokenizer.batch_encode_plus(
["HuggingFace is solving NLP one commit at a time"], for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
max_length=6, self.assertEqual(len(tokens[key].shape), 2)
padding=True,
truncation="only_first", # Mono sample
return_tensors=returned_tensor, tokens = tokenizer.batch_encode_plus(
return_overflowing_tokens=True, ["HuggingFace is solving NLP one commit at a time"],
) max_length=6,
padding=True,
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): truncation="only_first",
self.assertEqual(len(tokens[key].shape), 2) return_tensors=returned_tensor,
self.assertEqual(tokens[key].shape[-1], 6) return_overflowing_tokens=True,
)
# Multi sample
tokens = tokenizer.batch_encode_plus( for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
["HuggingFace is solving NLP one commit at a time", "Very tiny input"], self.assertEqual(len(tokens[key].shape), 2)
max_length=6, self.assertEqual(tokens[key].shape[-1], 6)
padding=True,
truncation="only_first", # Multi sample
return_tensors=returned_tensor, tokens = tokenizer.batch_encode_plus(
return_overflowing_tokens=True, ["HuggingFace is solving NLP one commit at a time", "Very tiny input"],
) max_length=6,
padding=True,
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): truncation="only_first",
self.assertEqual(len(tokens[key].shape), 2) return_tensors=returned_tensor,
self.assertEqual(tokens[key].shape[-1], 6) return_overflowing_tokens=True,
)
def assert_pretokenized_inputs(self, tokenizer_r, tokenizer_p):
# Input string for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
pretokenized_input_simple = "This is a sample input".split() self.assertEqual(len(tokens[key].shape), 2)
pretokenized_input_pair = "This is a sample pair".split() self.assertEqual(tokens[key].shape[-1], 6)
    def test_pretokenized_inputs(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)

                # Input string
                pretokenized_input_simple = "This is a sample input".split()
                pretokenized_input_pair = "This is a sample pair".split()

                # Test encode for pretokenized inputs
                output_r = tokenizer_r.encode(
                    pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
                )
                output_p = tokenizer_p.encode(
                    pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
                )
                self.assertEqual(output_p, output_r)

                kwargs = {
                    "is_split_into_words": True,
                    # "return_token_type_ids": True, # Use the defaults for each tokenizers
                    # "return_attention_mask": True, # Use the defaults for each tokenizers
                    "return_overflowing_tokens": False,
                    "return_special_tokens_mask": True,
                    "return_offsets_mapping": False,  # Not implemented in python tokenizers
                    # "add_special_tokens": False,
                }
                batch_kwargs = {
                    "is_split_into_words": True,
                    # "return_token_type_ids": True, # Use the defaults for each tokenizers
                    # "return_attention_mask": True, # Use the defaults for each tokenizers
                    "return_overflowing_tokens": False,
                    "return_special_tokens_mask": True,
                    "return_offsets_mapping": False,  # Not implemented in python tokenizers
                    # "add_special_tokens": False,
                }

                # Test encode_plus for pretokenized inputs
                output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs)
                output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test batch_encode_plus for pretokenized inputs
                input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
                output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs)
                output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test encode for pretokenized inputs pairs
                output_r = tokenizer_r.encode(
                    pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
                )
                output_p = tokenizer_p.encode(
                    pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
                )
                self.assertEqual(output_p, output_r)

                # Test encode_plus for pretokenized inputs
                output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
                output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test batch_encode_plus for pretokenized inputs
                input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [
                    pretokenized_input_simple + pretokenized_input_pair,
                    pretokenized_input_pair,
                ]
                output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs)
                output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])
    def test_create_token_type_ids(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
                input_simple = [1, 2, 3]
                input_pair = [1, 2, 3]

                # Generate output
                output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple)
                output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple)
                self.assertEqual(output_p, output_r)

                # Generate pair output
                output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair)
                output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair)
                self.assertEqual(output_p, output_r)
    def test_build_inputs_with_special_tokens(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
                # # Input string
                # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
                # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)

                # # Generate output
                # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
                # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
                # self.assertEqual(output_p, output_r)

                # # Generate pair output
                # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
                # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                # self.assertEqual(output_p, output_r)

                # Input tokens id
                input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
                input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)

                # Generate output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
                self.assertEqual(output_p, output_r)

                # Generate pair output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                self.assertEqual(output_p, output_r)
    def test_padding(self, max_length=50):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)

                def assert_padded_input_match(input_r: list, input_p: list, max_length: int):
                    # Ensure we match max_length
                    self.assertEqual(len(input_r), max_length)
                    self.assertEqual(len(input_p), max_length)

                    # Ensure the number of padded tokens is the same
                    padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r)))
                    padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p)))
                    self.assertSequenceEqual(padded_tokens_r, padded_tokens_p)

                def assert_batch_padded_input_match(input_r: dict, input_p: dict, max_length: int):
                    for i_r in input_r.values():
                        self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(
                            len(i_r[1]), max_length
                        )
                        self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(
                            len(i_r[1]), max_length
                        )

                    for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]):
                        assert_padded_input_match(i_r, i_p, max_length)

                    for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]):
                        self.assertSequenceEqual(i_r, i_p)

                # Encode - Simple input
                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
                assert_padded_input_match(input_r, input_p, max_length)
                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
                assert_padded_input_match(input_r, input_p, max_length)

                input_r = tokenizer_r.encode("This is a simple input", padding="longest")
                input_p = tokenizer_p.encode("This is a simple input", padding=True)
                assert_padded_input_match(input_r, input_p, len(input_r))

                # Encode - Pair input
                input_r = tokenizer_r.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                assert_padded_input_match(input_r, input_p, max_length)
                input_r = tokenizer_r.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                assert_padded_input_match(input_r, input_p, max_length)
                input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True)
                input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest")
                assert_padded_input_match(input_r, input_p, len(input_r))

                # Encode_plus - Simple input
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", max_length=max_length, pad_to_max_length=True
                )
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", max_length=max_length, padding="max_length"
                )
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest")
                input_p = tokenizer_p.encode_plus("This is a simple input", padding=True)
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))

                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Encode_plus - Pair input
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest")
                input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True)
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Batch_encode_plus - Simple input
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                assert_batch_padded_input_match(input_r, input_p, max_length)

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="max_length",
                )
                assert_batch_padded_input_match(input_r, input_p, max_length)

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="longest",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding=True,
                )
                assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"], padding="longest"
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"], padding=True
                )
                assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

                # Batch_encode_plus - Pair input
                input_r = tokenizer_r.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                assert_batch_padded_input_match(input_r, input_p, max_length)

                input_r = tokenizer_r.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    padding=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    padding="longest",
                )
                assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus("This is a input 1")
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.encode_plus("This is a input 1")
                input_p = tokenizer_r.pad(input_p)

                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))

                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus("This is a input 1")
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.encode_plus("This is a input 1")
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)

                # Using pad after tokenization
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a input 1", "This is a much longer input whilch should be padded"]
                )
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.batch_encode_plus(
                    ["This is a input 1", "This is a much longer input whilch should be padded"]
                )
                input_p = tokenizer_r.pad(input_p)

                assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

                # Using pad after tokenization
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a input 1", "This is a much longer input whilch should be padded"]
                )
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.batch_encode_plus(
                    ["This is a input 1", "This is a much longer input whilch should be padded"]
                )
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                assert_batch_padded_input_match(input_r, input_p, max_length)
def test_save_pretrained(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Checks it save with the same files
self.assertSequenceEqual(
tokenizer_r.save_vocabulary(self.tmpdirname), tokenizer_p.save_vocabulary(self.tmpdirname)
)
# Checks everything loads correctly in the same way
tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained(self.tmpdirname), tokenizer_p.from_pretrained(
self.tmpdirname
)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
def test_embeded_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(
sentence,
add_special_tokens=True,
)
tokens_p = tokenizer_p.encode_plus(
sentence,
add_special_tokens=True,
)
for key in tokens_p.keys():
self.assertEqual(tokens_r[key], tokens_p[key])
if "token_type_ids" in tokens_r:
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
self.assertSequenceEqual(tokens_r, tokens_p)
def test_add_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
# pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
for text in ["", " "]:
# tokenize()
no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode()
no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode_plus()
no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True)
for key in no_special_tokens.keys():
self.assertEqual(
len(no_special_tokens[key]),
len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
)
# # batch_encode_plus
no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False)
with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True)
for key in no_special_tokens.keys():
for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
def test_prepare_for_model(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
string_sequence = "Asserting that both tokenizers are equal"
python_output = tokenizer_p.prepare_for_model(
tokenizer_p.encode(string_sequence, add_special_tokens=False)
)
rust_output = tokenizer_r.prepare_for_model(
tokenizer_r.encode(string_sequence, add_special_tokens=False)
)
for key in python_output:
self.assertEqual(python_output[key], rust_output[key])
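# ----------------------------------------------------------------------
# Illustrative sketch (not part of this diff): every test above follows the
# same slow-vs-fast parity pattern. A minimal standalone version of that
# check, assuming network access to the "bert-base-cased" checkpoint, could
# look like this:
#
#     from transformers import AutoTokenizer
#
#     slow = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
#     fast = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
#
#     text = "A, naïve AllenNLP sentence."
#     assert slow.tokenize(text) == fast.tokenize(text)
#     assert slow.encode(text) == fast.encode(text)
# ----------------------------------------------------------------------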
class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
@@ -733,61 +856,86 @@ class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
            Tokenizer(
                "DistilBert", DistilBertTokenizerFast, DistilBertTokenizer, "vocab_file", filter_non_english, None
            ),
Tokenizer(
"DPRReaderTokenizer",
DPRReaderTokenizerFast,
DPRReaderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer(
"DPRQuestionEncoderTokenizer",
DPRQuestionEncoderTokenizerFast,
DPRQuestionEncoderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer(
"DPRContextEncoderTokenizer",
DPRContextEncoderTokenizerFast,
DPRContextEncoderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer("FunnelTokenizer", FunnelTokenizerFast, FunnelTokenizer, "vocab_file", filter_non_english, None),
Tokenizer("LxmertTokenizer", LxmertTokenizerFast, LxmertTokenizer, "vocab_file", filter_non_english, None),
        ]
    )
    def test_offsets_with_special_characters(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)

                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                tokens = tokenizer_r.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )

                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
                expected_results = (
                    [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "A"),
                        ((1, 2), ","),
                        ((3, 5), "na"),
                        ((5, 6), "##ï"),
                        ((6, 8), "##ve"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "Allen"),
                        ((21, 23), "##NL"),
                        ((23, 24), "##P"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                    if not do_lower_case
                    else [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "a"),
                        ((1, 2), ","),
                        ((3, 8), "naive"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "allen"),
                        ((21, 23), "##nl"),
                        ((23, 24), "##p"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                )

                self.assertEqual(
                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
                )
                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
class RobertaFastTokenizerTest(CommonFastTokenizerTest):
@@ -800,32 +948,52 @@ class RobertaFastTokenizerTest(CommonFastTokenizerTest):
                "vocab_file",
                filter_roberta_detectors,
                (("cls_token", "<s>"),),
            ),
Tokenizer(
"Bart",
BartTokenizerFast,
BartTokenizer,
"vocab_file",
None,
None,
),
        ]
    )
    def test_pretokenized_inputs(self):
        pass

    def test_embeded_special_tokens(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
                sentence = "A, <mask> AllenNLP sentence."
                tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)

                # token_type_ids should put 0 everywhere
                self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))

                # attention_mask should put 1 everywhere, so sum over length should be 1
                self.assertEqual(
                    sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
                    sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
                )

                tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
                tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])

                # Rust correctly handles the space before the mask while python doesnt
                self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
                self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])

                self.assertSequenceEqual(
                    tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                )
                self.assertSequenceEqual(
                    tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                )
class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
@@ -834,62 +1002,75 @@ class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
        Tokenizer("GPT2", GPT2TokenizerFast, GPT2Tokenizer, "vocab_file", None, [("add_prefix_space", True)]),
    ]
    def test_pretokenized_inputs(self):
        pass

    def test_padding(self, max_length=15):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)

                # Simple input
                s = "This is a simple input"
                s2 = ["This is a simple input 1", "This is a simple input 2"]
                p = ("This is a simple input", "This is a pair")
                p2 = [
                    ("This is a simple input 1", "This is a simple input 2"),
                    ("This is a simple pair 1", "This is a simple pair 2"),
                ]

                # Simple input tests
                self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")

                # Simple input
                self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")

                # Simple input
                self.assertRaises(
                    ValueError,
                    tokenizer_r.batch_encode_plus,
                    s2,
                    max_length=max_length,
                    padding="max_length",
                )

                # Pair input
                self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")

                # Pair input
                self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")

                # Pair input
                self.assertRaises(
                    ValueError,
                    tokenizer_r.batch_encode_plus,
                    p2,
                    max_length=max_length,
                    padding="max_length",
                )


class SentencePieceFastTokenizerTest(CommonFastTokenizerTest):
    """
    Override specific methods to test SentencePiece behavior
    """

    TOKENIZERS_CLASSES = frozenset(
        [
            Tokenizer("Albert", AlbertTokenizerFast, AlbertTokenizer, "vocab_file", None, None),
            Tokenizer("Camembert", CamembertTokenizerFast, CamembertTokenizer, "vocab_file", None, None),
            Tokenizer("T5", T5TokenizerFast, T5Tokenizer, "vocab_file", None, None),
            Tokenizer(
                "MBart",
                MBartTokenizerFast,
                MBartTokenizer,
                "vocab_file",
                None,
None,
),
Tokenizer("Pegasus", PegasusTokenizerFast, PegasusTokenizer, "vocab_file", None, None),
Tokenizer("Reformer", ReformerTokenizerFast, ReformerTokenizer, "vocab_file", None, None),
Tokenizer("XLMRoberta", XLMRobertaTokenizerFast, XLMRobertaTokenizer, "vocab_file", None, None),
Tokenizer("XLNet", XLNetTokenizerFast, XLNetTokenizer, "vocab_file", None, None),
]
)
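# ----------------------------------------------------------------------
# Illustrative sketch (not part of this diff): the SentencePiece-based fast
# tokenizers registered above can be compared against their slow counterparts
# directly. Assuming network access to the "t5-small" checkpoint:
#
#     from transformers import T5Tokenizer, T5TokenizerFast
#
#     slow = T5Tokenizer.from_pretrained("t5-small")
#     fast = T5TokenizerFast.from_pretrained("t5-small")
#
#     sequence = "I was born in 92000, and this is falsé."
#     assert slow.tokenize(sequence) == fast.tokenize(sequence)
#     assert slow.encode(sequence, add_special_tokens=False) == fast.encode(sequence, add_special_tokens=False)
# ----------------------------------------------------------------------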
@@ -26,6 +26,7 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = FunnelTokenizer
    test_rust_tokenizer = True
    space_between_special_tokens = True

    def setUp(self):
        super().setUp()
...
@@ -26,6 +26,7 @@ from .test_tokenization_common import TokenizerTesterMixin
class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = GPT2Tokenizer
    rust_tokenizer_class = GPT2TokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
...
@@ -18,7 +18,7 @@ import os
import unittest

from transformers.tokenization_bert import VOCAB_FILES_NAMES
from transformers.tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

@@ -26,6 +26,9 @@ from .test_tokenization_common import TokenizerTesterMixin
class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = LxmertTokenizer
    rust_tokenizer_class = LxmertTokenizerFast
    test_rust_tokenizer = True
    space_between_special_tokens = True

    def setUp(self):
        super().setUp()

@@ -49,9 +52,6 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_input_output_texts(self, tokenizer):
        input_text = "UNwant\u00E9d,running"
        output_text = "unwanted, running"

@@ -63,3 +63,25 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
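# ----------------------------------------------------------------------
# Illustrative sketch (not part of this diff): assuming the repository's usual
# tests/ layout, the new slow-vs-fast comparison for a single model can be run
# in isolation with pytest, e.g.:
#
#     pytest tests/test_tokenization_lxmert.py -k "rust_and_python_full_tokenizers" -v
# ----------------------------------------------------------------------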
@@ -38,6 +38,7 @@ FRAMEWORK = "pt" if _torch_available else "tf"
class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = MarianTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
...
import tempfile
import unittest

from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available
from transformers.testing_utils import require_torch

from .test_tokenization_common import TokenizerTesterMixin

@@ -17,6 +17,8 @@ RO_CODE = 250020
class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = MBartTokenizer
    rust_tokenizer_class = MBartTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
...
@@ -18,7 +18,7 @@ import json
import os
import unittest

from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer, OpenAIGPTTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = OpenAIGPTTokenizer
    rust_tokenizer_class = OpenAIGPTTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
...
@@ -3,7 +3,7 @@ from pathlib import Path
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch
from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

@@ -11,6 +11,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = PegasusTokenizer
    rust_tokenizer_class = PegasusTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
...
@@ -19,7 +19,7 @@ import unittest
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch, slow
from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

@@ -30,6 +30,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = ReformerTokenizer
    rust_tokenizer_class = ReformerTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()

@@ -37,6 +39,28 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
    def test_full_tokenizer(self):
        tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
...
@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = RobertaTokenizer
    rust_tokenizer_class = RobertaTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
...
@@ -20,13 +20,12 @@ import unittest
from transformers import BatchEncoding
from transformers.file_utils import cached_property
from transformers.testing_utils import _torch_available
from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

SPIECE_UNDERLINE = "▁"

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
FRAMEWORK = "pt" if _torch_available else "tf"

@@ -35,6 +34,8 @@ FRAMEWORK = "pt" if _torch_available else "tf"
class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = T5Tokenizer
    rust_tokenizer_class = T5TokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()

@@ -113,6 +114,38 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def t5_base_tokenizer(self):
        return T5Tokenizer.from_pretrained("t5-base")
@cached_property
def t5_base_tokenizer_fast(self):
return T5TokenizerFast.from_pretrained("t5-base")
def get_tokenizer(self, **kwargs) -> T5Tokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
    def test_eos_treatment(self):
        tokenizer = self.t5_base_tokenizer
        batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"])
...
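The new `t5_base_tokenizer_fast` property and the `get_rust_tokenizer` override above wire the fast T5 tokenizer into the shared tester mixin. As a usage sketch only, assuming the public `t5-base` checkpoint and that the parity asserted by the tests also holds for it:

from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast

# The slow tokenizer wraps the SentencePiece model directly; the fast one is the
# converted, `tokenizers`-backed version exercised by the new tests.
slow = T5Tokenizer.from_pretrained("t5-base")
fast = T5TokenizerFast.from_pretrained("t5-base")

text = "I went to the gym"
assert slow.tokenize(text) == fast.tokenize(text)
assert slow.encode(text) == fast.encode(text)

# Both accept batches and return a BatchEncoding, as in test_eos_treatment above.
batch = fast(["hi</s>", "I went to the gym</s>", "</s>"], padding=True)
print(batch["input_ids"])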
@@ -17,20 +17,15 @@
import os
import unittest

from transformers.tokenization_transfo_xl import VOCAB_FILES_NAMES, TransfoXLTokenizer

from .test_tokenization_common import TokenizerTesterMixin


class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = TransfoXLTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
...
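The hunk above covers the breaking part of this PR: the fast Transfo-XL tokenizer is removed, so the test class sets `test_rust_tokenizer = False` and the imports no longer need the torch guard. A minimal sketch of what remains available, with the `transfo-xl-wt103` checkpoint name assumed for illustration:

from transformers.tokenization_transfo_xl import TransfoXLTokenizer

# Only the pure-Python (slow) implementation exists for Transfo-XL after this change;
# there is no fast counterpart to fall back to.
tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")  # checkpoint name assumed
tokens = tokenizer.tokenize("Hello , world !")
print(tokens, tokenizer.convert_tokens_to_ids(tokens))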
@@ -27,6 +27,7 @@ from .test_tokenization_common import TokenizerTesterMixin

class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLMTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
...
@@ -19,7 +19,7 @@ import unittest

from transformers.file_utils import cached_property
from transformers.testing_utils import slow
from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin
@@ -30,6 +30,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture

class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLMRobertaTokenizer
    rust_tokenizer_class = XLMRobertaTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
@@ -118,6 +120,28 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def big_tokenizer(self):
        return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "I was born in 92000, and this is falsé."

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

    @slow
    def test_tokenization_base_easy_symbols(self):
        symbols = "Hello World!"
...
@@ -18,7 +18,7 @@ import os
import unittest

from transformers.testing_utils import slow
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin
@@ -29,12 +29,15 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture

class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLNetTokenizer
    rust_tokenizer_class = XLNetTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.sanitize_special_tokens()
        tokenizer.save_pretrained(self.tmpdirname)

    def test_full_tokenizer(self):
...