chenpangpang / transformers · Commits · c852036b

Unverified commit c852036b, authored Jun 16, 2020 by Amil Khare, committed via GitHub on Jun 16, 2020.

    [cleanup] Hoist ModelTester objects to top level (#4939)

    Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

Parent: 0c55a384

Changes: 25. Showing 20 changed files with 3360 additions and 3738 deletions (+3360, -3738).
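The commit title describes a mechanical refactor: each per-model ModelTester helper, previously defined as a class nested inside its unittest.TestCase and configured through keyword arguments with defaults, is moved to module (top) level with those settings hard-coded as attributes. A minimal sketch of the before/after shape, using a hypothetical FooModelTester in place of the real per-model classes:

    import unittest


    # Before: the tester lived inside the TestCase and took its settings as keyword defaults.
    class FooModelTestBefore(unittest.TestCase):
        class FooModelTester(object):
            def __init__(self, parent, batch_size=13, seq_length=7):
                self.parent = parent
                self.batch_size = batch_size
                self.seq_length = seq_length

        def setUp(self):
            self.model_tester = FooModelTestBefore.FooModelTester(self)


    # After: the tester is hoisted to module level; the defaults become plain attributes.
    class FooModelTester:
        def __init__(self, parent):
            self.parent = parent
            self.batch_size = 13
            self.seq_length = 7


    class FooModelTestAfter(unittest.TestCase):
        def setUp(self):
            self.model_tester = FooModelTester(self)

The per-file diffs below all follow this shape; only the model classes, attribute values, and create_and_check_* methods differ.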
20 changed files:

    tests/test_modeling_albert.py            +221  -249
    tests/test_modeling_ctrl.py              +135  -157
    tests/test_modeling_distilbert.py         +22   -22
    tests/test_modeling_electra.py           +243  -268
    tests/test_modeling_flaubert.py          +267  -300
    tests/test_modeling_gpt2.py              +264  -264
    tests/test_modeling_longformer.py         +24   -47
    tests/test_modeling_openai.py            +134  -154
    tests/test_modeling_roberta.py           +178  -203
    tests/test_modeling_t5.py                +263  -295
    tests/test_modeling_tf_albert.py         +181  -180
    tests/test_modeling_tf_bert.py           +216  -217
    tests/test_modeling_tf_ctrl.py           +128  -150
    tests/test_modeling_tf_distilbert.py     +123  -144
    tests/test_modeling_tf_electra.py        +133  -156
    tests/test_modeling_tf_gpt2.py           +233  -255
    tests/test_modeling_tf_openai_gpt.py     +150  -172
    tests/test_modeling_tf_roberta.py        +134  -157
    tests/test_modeling_tf_t5.py             +181  -200
    tests/test_modeling_tf_transfo_xl.py     +130  -148
tests/test_modeling_albert.py  (view file @ c852036b)

@@ -37,6 +37,226 @@ if is_torch_available():
Context: the import of ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.modeling_albert.
Added, just before the @require_torch class AlbertModelTest(ModelTesterMixin, unittest.TestCase) declaration, a module-level AlbertModelTester:

    class AlbertModelTester:
        def __init__(self, parent):
            self.parent = parent
            self.batch_size = 13
            self.seq_length = 7
            self.is_training = True
            self.use_input_mask = True
            self.use_token_type_ids = True
            self.use_labels = True
            self.vocab_size = 99
            self.embedding_size = 16
            self.hidden_size = 36
            self.num_hidden_layers = 6
            self.num_hidden_groups = 6
            self.num_attention_heads = 6
            self.intermediate_size = 37
            self.hidden_act = "gelu"
            self.hidden_dropout_prob = 0.1
            self.attention_probs_dropout_prob = 0.1
            self.max_position_embeddings = 512
            self.type_vocab_size = 16
            self.type_sequence_label_size = 2
            self.initializer_range = 0.02
            self.num_labels = 3
            self.num_choices = 4
            self.scope = None

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                num_hidden_groups=self.num_hidden_groups,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_albert_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = AlbertModel(config=config)
            model.to(torch_device)
            model.eval()
            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)
            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_pretraining(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = AlbertForPreTraining(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores, sop_scores = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                labels=token_labels,
                sentence_order_label=sequence_labels,
            )
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
                "sop_scores": sop_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(list(result["sop_scores"].size()), [self.batch_size, config.num_labels])
            self.check_loss_output(result)

        def create_and_check_albert_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = AlbertForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
            )
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.check_loss_output(result)

        def create_and_check_albert_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = AlbertForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            loss, start_logits, end_logits = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                start_positions=sequence_labels,
                end_positions=sequence_labels,
            )
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_albert_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = AlbertForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def create_and_check_albert_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = AlbertForTokenClassification(config=config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
            )
            self.check_loss_output(result)

        def create_and_check_albert_for_multiple_choice(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_choices = self.num_choices
            model = AlbertForMultipleChoice(config=config)
            model.to(torch_device)
            model.eval()
            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            loss, logits = model(
                multiple_choice_inputs_ids,
                attention_mask=multiple_choice_input_mask,
                token_type_ids=multiple_choice_token_type_ids,
                labels=choice_labels,
            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

@@ -54,256 +274,8 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
Removed, after the closing "else ()" of the all_model_classes tuple: the nested AlbertModelTester(object) class. Its __init__ took the same settings as keyword arguments (parent, batch_size=13, seq_length=7, is_training=True, use_input_mask=True, use_token_type_ids=True, use_labels=True, vocab_size=99, embedding_size=16, hidden_size=36, num_hidden_layers=6, num_hidden_groups=6, num_attention_heads=6, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, scope=None) and assigned them to attributes; its prepare_config_and_inputs, check_loss_output, create_and_check_* and prepare_config_and_inputs_for_common methods match the hoisted class above (the nested version additionally called self.check_loss_output(result) at the end of create_and_check_albert_for_multiple_choice). In setUp,

    self.model_tester = AlbertModelTest.AlbertModelTester(self)

becomes

    self.model_tester = AlbertModelTester(self)

while self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) and def test_config(self): remain unchanged.
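The diff does not show the AlbertModelTest test methods themselves; in this test suite they typically fetch inputs from the tester and delegate all checks to it. A sketch of that wiring, assumed because those methods sit outside the changed hunks:

    import unittest

    class AlbertModelTestSketch(unittest.TestCase):
        # Assumed shape of an untouched test method; the real AlbertModelTest methods
        # are not part of this diff.
        def setUp(self):
            self.model_tester = AlbertModelTester(self)

        def test_albert_model(self):
            config_and_inputs = self.model_tester.prepare_config_and_inputs()
            self.model_tester.create_and_check_albert_model(*config_and_inputs)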
tests/test_modeling_ctrl.py  (view file @ c852036b)

@@ -27,6 +27,140 @@ if is_torch_available():
Context: from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, CTRLLMHeadModel.
Added: a module-level CTRLModelTester class, laid out like the hoisted AlbertModelTester above. Its __init__(self, parent) hard-codes batch_size=14, seq_length=7, is_training=True, use_token_type_ids=True, use_input_mask=True, use_labels=True, use_mc_token_ids=True, vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, scope=None.

prepare_config_and_inputs builds input_ids, an optional input_mask, token_type_ids, mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length), the usual label tensors, head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2), and

    config = CTRLConfig(
        vocab_size=self.vocab_size,
        n_embd=self.hidden_size,
        n_layer=self.num_hidden_layers,
        n_head=self.num_attention_heads,
        # intermediate_size=self.intermediate_size,
        # hidden_act=self.hidden_act,
        # hidden_dropout_prob=self.hidden_dropout_prob,
        # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        n_positions=self.max_position_embeddings,
        n_ctx=self.max_position_embeddings
        # type_vocab_size=self.type_vocab_size,
        # initializer_range=self.initializer_range
    )

returning (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels). check_loss_output asserts a scalar loss; create_and_check_ctrl_model checks that CTRLModel returns a sequence_output of shape [batch_size, seq_length, hidden_size] and that len(presents) == config.n_layer; create_and_check_lm_head_model checks that CTRLLMHeadModel called with labels=input_ids returns a scalar loss and lm_logits of shape [batch_size, seq_length, vocab_size]; prepare_config_and_inputs_for_common returns the config plus an inputs_dict of input_ids, token_type_ids and head_mask.

@@ -37,164 +171,8 @@ class CTRLModelTest(ModelTesterMixin, unittest.TestCase):
Removed, below the test_resize_embeddings = False / test_head_masking = False flags: the nested CTRLModelTester(object) class (same settings taken as keyword arguments, same methods as the hoisted class). In setUp,

    self.model_tester = CTRLModelTest.CTRLModelTester(self)

becomes

    self.model_tester = CTRLModelTester(self)

while self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37) and def test_config(self): remain unchanged.
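Every prepare_config_and_inputs above leans on the ids_tensor helper imported from the shared test utilities. Its real definition is not part of this diff; as an assumption, it behaves roughly like the following stand-in, producing random token ids below the given bound:

    import torch

    def ids_tensor(shape, vocab_size):
        # Hypothetical stand-in for the shared test helper (signature and behaviour assumed):
        # a long tensor of random ids in [0, vocab_size) with the requested shape.
        return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long)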
tests/test_modeling_distilbert.py  (view file @ c852036b)

@@ -34,27 +34,6 @@
Removed from the top of the file (just after the DistilBertForSequenceClassification import): the @require_torch-decorated DistilBertModelTest(ModelTesterMixin, unittest.TestCase) declaration, its all_model_classes tuple (DistilBertModel, DistilBertForMaskedLM, DistilBertForMultipleChoice, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DistilBertForTokenClassification) if is_torch_available() else None, and its flags test_pruning = True, test_torchscript = True, test_resize_embeddings = True, test_head_masking = True, which previously sat above the DistilBertModelTester(object) class.

@@ -245,8 +224,29 @@
Added after the tester's prepare_config_and_inputs_for_common (which returns config and inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}): the same @require_torch class DistilBertModelTest(ModelTesterMixin, unittest.TestCase) declaration, all_model_classes tuple and flags, now placed below the tester. In setUp,

    self.model_tester = DistilBertModelTest.DistilBertModelTester(self)

becomes

    self.model_tester = DistilBertModelTester(self)

while self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) and def test_config(self): remain unchanged.
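Beyond readability, a top-level tester can be imported by other test modules, which a class nested inside a TestCase could not be cleanly. A hypothetical illustration of the kind of reuse this enables; the import path is assumed and is not something this commit adds:

    import unittest

    # Hypothetical reuse of the now top-level tester from another test module
    # (import path assumed; not part of this commit).
    from tests.test_modeling_distilbert import DistilBertModelTester


    class SomeDownstreamTest(unittest.TestCase):
        def setUp(self):
            self.model_tester = DistilBertModelTester(self)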
tests/test_modeling_electra.py  (view file @ c852036b)

@@ -36,200 +36,69 @@
Context: the import of ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.modeling_electra.
Removed from this position: the @require_torch class ElectraModelTest(ModelTesterMixin, unittest.TestCase) declaration with its all_model_classes tuple (ElectraModel, ElectraForPreTraining, ElectraForMaskedLM, ElectraForTokenClassification, ElectraForSequenceClassification, ElectraForQuestionAnswering) if is_torch_available() else (), together with the nested ElectraModelTester(object) whose __init__ took the settings as keyword arguments.
Added in its place: a module-level ElectraModelTester whose __init__(self, parent) hard-codes batch_size=13, seq_length=7, is_training=True, use_input_mask=True, use_token_type_ids=True, use_labels=True, vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, scope=None. Its prepare_config_and_inputs builds the usual input_ids / input_mask / token_type_ids / label tensors plus fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1), and

    config = ElectraConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        is_decoder=False,
        initializer_range=self.initializer_range,
    )

returning (config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, fake_token_labels).

@@ -238,48 +107,168 @@ and @@ -288,43 +277,29 @@
The check methods move unchanged into the hoisted class: check_loss_output; create_and_check_electra_model (ElectraModel returns (sequence_output,) of shape [batch_size, seq_length, hidden_size]); create_and_check_electra_for_masked_lm (scalar loss, prediction_scores of shape [batch_size, seq_length, vocab_size]); create_and_check_electra_for_token_classification (logits of shape [batch_size, seq_length, num_labels]); create_and_check_electra_for_pretraining (labels=fake_token_labels, logits of shape [batch_size, seq_length]); create_and_check_electra_for_sequence_classification (logits of shape [batch_size, num_labels]); create_and_check_electra_for_question_answering (start_logits and end_logits of shape [batch_size, seq_length]); and prepare_config_and_inputs_for_common (inputs_dict of input_ids, token_type_ids and attention_mask). The @require_torch class ElectraModelTest(ModelTesterMixin, unittest.TestCase) declaration, with the same all_model_classes tuple, is re-added below the tester. In setUp,

    self.model_tester = ElectraModelTest.ElectraModelTester(self)

becomes

    self.model_tester = ElectraModelTester(self)

while self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) and def test_config(self): remain unchanged.
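Each of these checks compares tensor shapes by converting torch.Size (a tuple subclass) to a plain list before assertListEqual. A minimal, self-contained illustration of that pattern with made-up dimensions:

    import torch

    # Made-up dimensions, only to illustrate the list(tensor.size()) comparison used above.
    logits = torch.zeros(13, 7, 3)            # e.g. [batch_size, seq_length, num_labels]
    assert list(logits.size()) == [13, 7, 3]
    assert isinstance(logits.size(), tuple)   # torch.Size subclasses tuple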
tests/test_modeling_flaubert.py  (view file @ c852036b)

@@ -35,140 +35,81 @@
Context: the import of FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.modeling_flaubert.
Removed from this position: the @require_torch class FlaubertModelTest(ModelTesterMixin, unittest.TestCase) declaration with the opening of its all_model_classes tuple (FlaubertModel, FlaubertWithLMHeadModel, FlaubertForQuestionAnswering, FlaubertForQuestionAnsweringSimple, FlaubertForSequenceClassification, ...), together with the nested FlaubertModelTester(object) whose __init__ took the settings as keyword arguments (notably type_vocab_size=16 and use_proj=True).
Added in its place: a module-level FlaubertModelTester(object) whose __init__(self, parent) hard-codes batch_size=13, seq_length=7, is_training=True, use_input_lengths=True, use_token_type_ids=True, use_labels=True, gelu_activation=True, sinusoidal_embeddings=False, causal=False, asm=False, n_langs=2, vocab_size=99, n_special=0, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=12, type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, summary_type="last", use_proj=None, scope=None. Its prepare_config_and_inputs builds input_ids, a float input_mask, an optional input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 (a small variation of seq_length), token_type_ids drawn over n_langs, sequence/token labels plus a float is_impossible_labels tensor, and

    config = FlaubertConfig(
        vocab_size=self.vocab_size,
        n_special=self.n_special,
        emb_dim=self.hidden_size,
        n_layers=self.num_hidden_layers,
        n_heads=self.num_attention_heads,
        dropout=self.hidden_dropout_prob,
        attention_dropout=self.attention_probs_dropout_prob,
        gelu_activation=self.gelu_activation,
        sinusoidal_embeddings=self.sinusoidal_embeddings,
        asm=self.asm,
        causal=self.causal,
        n_langs=self.n_langs,
        max_position_embeddings=self.max_position_embeddings,
        initializer_range=self.initializer_range,
        summary_type=self.summary_type,
        use_proj=self.use_proj,
    )

returning (config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask).

@@ -177,146 +118,185 @@
The check methods move into the hoisted class: check_loss_output; create_and_check_flaubert_model (FlaubertModel called with lengths=input_lengths and langs=token_type_ids, then with langs only, then with input_ids only; sequence_output of shape [batch_size, seq_length, hidden_size]); create_and_check_flaubert_lm_head (FlaubertWithLMHeadModel with labels=token_labels, scalar loss and logits of shape [batch_size, seq_length, vocab_size]); create_and_check_flaubert_simple_qa (FlaubertForQuestionAnsweringSimple, start_logits and end_logits of shape [batch_size, seq_length]); create_and_check_flaubert_qa (FlaubertForQuestionAnswering called once without labels to unpack start_top_log_probs, start_top_index, end_top_log_probs, end_top_index and cls_logits, then with start_positions, end_positions, cls_index, is_impossible and p_mask, then without p_mask, and finally with only start_positions and end_positions, unpacking (total_loss,) from the labelled calls; the asserted shapes are [batch_size, model.config.start_n_top] for the start tensors, [batch_size, model.config.start_n_top * model.config.end_n_top] for the end tensors, and [batch_size] for cls_logits); and create_and_check_flaubert_sequence_classif (…)
.
end_n_top
],
)
self
.
parent
.
assertListEqual
(
list
(
result
[
"end_top_index"
].
size
()),
[
self
.
batch_size
,
model
.
config
.
start_n_top
*
model
.
config
.
end_n_top
],
)
self
.
parent
.
assertListEqual
(
list
(
result
[
"cls_logits"
].
size
()),
[
self
.
batch_size
])
def
create_and_check_flaubert_sequence_classif
(
self
,
config
,
input_ids
,
token_type_ids
,
input_lengths
,
sequence_labels
,
token_labels
,
is_impossible_labels
,
input_mask
,
):
model
=
FlaubertForSequenceClassification
(
config
)
model
.
to
(
torch_device
)
model
.
eval
()
(
logits
,)
=
model
(
input_ids
)
loss
,
logits
=
model
(
input_ids
,
labels
=
sequence_labels
)
result
=
{
"loss"
:
loss
,
"logits"
:
logits
,
}
self
.
parent
.
assertListEqual
(
list
(
result
[
"loss"
].
size
()),
[])
self
.
parent
.
assertListEqual
(
list
(
result
[
"logits"
].
size
()),
[
self
.
batch_size
,
self
.
type_sequence_label_size
])
def
prepare_config_and_inputs_for_common
(
self
):
config_and_inputs
=
self
.
prepare_config_and_inputs
()
(
config
,
input_ids
,
token_type_ids
,
...
...
@@ -325,41 +305,28 @@ class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):
token_labels
,
is_impossible_labels
,
input_mask
,
):
model
=
FlaubertForSequenceClassification
(
config
)
model
.
to
(
torch_device
)
model
.
eval
()
(
logits
,)
=
model
(
input_ids
)
loss
,
logits
=
model
(
input_ids
,
labels
=
sequence_labels
)
result
=
{
"loss"
:
loss
,
"logits"
:
logits
,
}
self
.
parent
.
assertListEqual
(
list
(
result
[
"loss"
].
size
()),
[])
self
.
parent
.
assertListEqual
(
list
(
result
[
"logits"
].
size
()),
[
self
.
batch_size
,
self
.
type_sequence_label_size
]
)
def
prepare_config_and_inputs_for_common
(
self
):
config_and_inputs
=
self
.
prepare_config_and_inputs
()
(
config
,
input_ids
,
token_type_ids
,
input_lengths
,
sequence_labels
,
token_labels
,
is_impossible_labels
,
input_mask
,
)
=
config_and_inputs
inputs_dict
=
{
"input_ids"
:
input_ids
,
"token_type_ids"
:
token_type_ids
,
"lengths"
:
input_lengths
}
return
config
,
inputs_dict
)
=
config_and_inputs
inputs_dict
=
{
"input_ids"
:
input_ids
,
"token_type_ids"
:
token_type_ids
,
"lengths"
:
input_lengths
}
return
config
,
inputs_dict
@require_torch
class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):

    all_model_classes = (
        (
            FlaubertModel,
            FlaubertWithLMHeadModel,
            FlaubertForQuestionAnswering,
            FlaubertForQuestionAnsweringSimple,
            FlaubertForSequenceClassification,
        )
        if is_torch_available()
        else ()
    )

    def setUp(self):
-        self.model_tester = FlaubertModelTest.FlaubertModelTester(self)
+        self.model_tester = FlaubertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37)

    def test_config(self):
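As an aside (not part of the commit), the test methods hidden behind the collapsed "..." context follow the usual ModelTesterMixin pattern: pair prepare_config_and_inputs with one create_and_check_* call on the hoisted tester. A minimal sketch, assuming it lives inside FlaubertModelTest whose setUp() created self.model_tester above:

# Sketch only: how a test method typically drives the hoisted tester.
def test_flaubert_model(self):
    config_and_inputs = self.model_tester.prepare_config_and_inputs()
    self.model_tester.create_and_check_flaubert_model(*config_and_inputs)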
...
...
tests/test_modeling_gpt2.py View file @ c852036b
...
...
@@ -34,6 +34,269 @@ if is_torch_available():
)
class GPT2ModelTester:
    def __init__(
        self,
        parent,
        batch_size=14,
        seq_length=7,
        is_training=True,
        use_token_type_ids=True,
        use_input_mask=True,
        use_labels=True,
        use_mc_token_ids=True,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        num_choices=4,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = 14
        self.seq_length = 7
        self.is_training = True
        self.use_token_type_ids = True
        self.use_input_mask = True
        self.use_labels = True
        self.use_mc_token_ids = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None
        self.bos_token_id = vocab_size - 1
        self.eos_token_id = vocab_size - 1

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = GPT2Config(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )

    def check_loss_output(self, result):
        self.parent.assertListEqual(list(result["loss"].size()), [])

    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2Model(config=config)
        model.to(torch_device)
        model.eval()

        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
        model(input_ids, token_type_ids=token_type_ids)
        sequence_output, presents = model(input_ids)

        result = {
            "sequence_output": sequence_output,
            "presents": presents,
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size],
        )
        self.parent.assertEqual(len(result["presents"]), config.n_layer)

    def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2Model(config=config)
        model.to(torch_device)
        model.eval()

        # first forward pass
        output, past = model(input_ids, token_type_ids=token_type_ids)

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
        next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)

        # append to next input_ids and token_type_ids
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)

        output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
        output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

    def create_and_check_gpt2_model_attention_mask_past(
        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
    ):
        model = GPT2Model(config=config)
        model.to(torch_device)
        model.eval()

        # create attention mask
        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
        half_seq_length = self.seq_length // 2
        attn_mask[:, half_seq_length:] = 0

        # first forward pass
        output, past = model(input_ids, attention_mask=attn_mask)

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids
        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens

        # append to next input_ids and attn_mask
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        attn_mask = torch.cat(
            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
        )

        # get two different outputs
        output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
        output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

    def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2LMHeadModel(config)
        model.to(torch_device)
        model.eval()

        loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

        result = {"loss": loss, "lm_logits": lm_logits}

        self.parent.assertListEqual(list(result["loss"].size()), [])
        self.parent.assertListEqual(
            list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
        )

    def create_and_check_double_lm_head_model(
        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
    ):
        model = GPT2DoubleHeadsModel(config)
        model.to(torch_device)
        model.eval()

        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

        inputs = {
            "input_ids": multiple_choice_inputs_ids,
            "mc_token_ids": mc_token_ids,
            "attention_mask": multiple_choice_input_mask,
            "token_type_ids": multiple_choice_token_type_ids,
            "labels": multiple_choice_inputs_ids,
        }

        loss, lm_logits, mc_logits, _ = model(**inputs)

        result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}

        self.parent.assertListEqual(list(result["loss"].size()), [])
        self.parent.assertListEqual(
            list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
        )
        self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()

        (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs

        inputs_dict = {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "head_mask": head_mask,
        }

        return config, inputs_dict
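As an aside (not part of the commit), the caching check in create_and_check_gpt2_model_past above boils down to the following self-contained sketch. It uses the same tuple-return and `past` keyword API that these tests call in the transformers version this commit targets, and the config sizes are only illustrative:

# Sketch: feeding one new token with a reused cache must match the uncached full pass.
import torch
from transformers import GPT2Config, GPT2Model

config = GPT2Config(vocab_size=99, n_embd=32, n_layer=2, n_head=4, n_positions=64, n_ctx=64)
model = GPT2Model(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 7))
next_token = torch.randint(0, config.vocab_size, (1, 1))

with torch.no_grad():
    _, past = model(input_ids)  # full pass; the "presents" cache is the second output
    full_output, _ = model(torch.cat([input_ids, next_token], dim=-1))  # recompute everything
    cached_output, _ = model(next_token, past=past)  # feed only the new token, reuse the cache

# the hidden state of the new token should match whether or not the cache was used
assert torch.allclose(full_output[:, -1], cached_output[:, 0], atol=1e-3)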
@require_torch
class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
...
...
@@ -42,271 +305,8 @@ class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
        (GPT2LMHeadModel,) if is_torch_available() else ()
    )  # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly

    # [removed here: the nested GPT2ModelTester(object) class; its methods match the top-level
    #  GPT2ModelTester hoisted above, except that its __init__ assigned the constructor arguments
    #  instead of hard-coded values]

    def setUp(self):
-        self.model_tester = GPT2ModelTest.GPT2ModelTester(self)
+        self.model_tester = GPT2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)

    def test_config(self):
...
...
tests/test_modeling_longformer.py View file @ c852036b
...
...
@@ -36,56 +36,33 @@ if is_torch_available():
)
# [the old __init__ took these same values as keyword arguments on LongformerModelTester(object);
#  this commit hard-codes them]
class LongformerModelTester:
    def __init__(
        self,
        parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None
        self.attention_window = 4
# `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size
# [num_attention_heads, encoder_seq_length, encoder_key_length], but LongformerSelfAttention
...
...
tests/test_modeling_openai.py View file @ c852036b
...
...
@@ -34,6 +34,139 @@ if is_torch_available():
)
class OpenAIGPTModelTester:
    def __init__(
        self,
        parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = OpenAIGPTConfig(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            head_mask,
            token_type_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )

    def check_loss_output(self, result):
        self.parent.assertListEqual(list(result["loss"].size()), [])

    def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTModel(config=config)
        model.to(torch_device)
        model.eval()

        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
        model(input_ids, token_type_ids=token_type_ids)
        (sequence_output,) = model(input_ids)

        result = {"sequence_output": sequence_output}
        self.parent.assertListEqual(
            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size],
        )

    def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTLMHeadModel(config)
        model.to(torch_device)
        model.eval()

        loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

        result = {"loss": loss, "lm_logits": lm_logits}

        self.parent.assertListEqual(list(result["loss"].size()), [])
        self.parent.assertListEqual(
            list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
        )

    def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTDoubleHeadsModel(config)
        model.to(torch_device)
        model.eval()

        loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

        result = {"loss": loss, "lm_logits": lm_logits}

        self.parent.assertListEqual(list(result["loss"].size()), [])
        self.parent.assertListEqual(
            list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
        )

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            head_mask,
            token_type_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs

        inputs_dict = {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "head_mask": head_mask,
        }

        return config, inputs_dict
@require_torch
class OpenAIGPTModelTest(ModelTesterMixin, unittest.TestCase):
...
...
@@ -44,161 +177,8 @@ class OpenAIGPTModelTest(ModelTesterMixin, unittest.TestCase):
        (OpenAIGPTLMHeadModel,) if is_torch_available() else ()
    )  # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly

    # [removed here: the nested OpenAIGPTModelTester(object) class; its methods match the top-level
    #  OpenAIGPTModelTester hoisted above, except that its __init__ assigned the constructor
    #  arguments instead of hard-coded values]

    def setUp(self):
-        self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self)
+        self.model_tester = OpenAIGPTModelTester(self)
        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)

    def test_config(self):
...
...
tests/test_modeling_roberta.py View file @ c852036b
...
...
@@ -39,6 +39,183 @@ if is_torch_available():
from transformers.modeling_utils import create_position_ids_from_input_ids
class RobertaModelTester:
    def __init__(
        self,
        parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = RobertaConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

    def check_loss_output(self, result):
        self.parent.assertListEqual(list(result["loss"].size()), [])

    def create_and_check_roberta_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = RobertaModel(config=config)
        model.to(torch_device)
        model.eval()
        sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
        sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
        sequence_output, pooled_output = model(input_ids)

        result = {
            "sequence_output": sequence_output,
            "pooled_output": pooled_output,
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

    def create_and_check_roberta_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = RobertaForMaskedLM(config=config)
        model.to(torch_device)
        model.eval()
        loss, prediction_scores = model(
            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
        )
        result = {
            "loss": loss,
            "prediction_scores": prediction_scores,
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.check_loss_output(result)

    def create_and_check_roberta_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = RobertaForTokenClassification(config=config)
        model.to(torch_device)
        model.eval()
        loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
        result = {
            "loss": loss,
            "logits": logits,
        }
        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
        self.check_loss_output(result)

    def create_and_check_roberta_for_multiple_choice(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_choices = self.num_choices
        model = RobertaForMultipleChoice(config=config)
        model.to(torch_device)
        model.eval()
        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        loss, logits = model(
            multiple_choice_inputs_ids,
            attention_mask=multiple_choice_input_mask,
            token_type_ids=multiple_choice_token_type_ids,
            labels=choice_labels,
        )
        result = {
            "loss": loss,
            "logits": logits,
        }
        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
        self.check_loss_output(result)

    def create_and_check_roberta_for_question_answering(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = RobertaForQuestionAnswering(config=config)
        model.to(torch_device)
        model.eval()
        loss, start_logits, end_logits = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            start_positions=sequence_labels,
            end_positions=sequence_labels,
        )
        result = {
            "loss": loss,
            "start_logits": start_logits,
            "end_logits": end_logits,
        }
        self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
        self.check_loss_output(result)

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
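A small aside (not part of the commit): the unsqueeze/expand/contiguous dance in create_and_check_roberta_for_multiple_choice above only reshapes the inputs so that every choice slot sees the same token ids, which is all the shape checks need. A self-contained sketch with illustrative sizes:

import torch

batch_size, seq_length, num_choices = 13, 7, 4
input_ids = torch.randint(0, 99, (batch_size, seq_length))

# (batch, seq) -> (batch, num_choices, seq); contiguous() materialises the expanded view
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, num_choices, -1).contiguous()
assert multiple_choice_inputs_ids.shape == (batch_size, num_choices, seq_length)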
@require_torch
class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
...
...
@@ -55,210 +232,8 @@ class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
        else ()
    )

    # [removed here: the nested RobertaModelTester(object) class; its methods match the top-level
    #  RobertaModelTester hoisted above, except that its __init__ assigned the constructor
    #  arguments instead of hard-coded values]

    def setUp(self):
-        self.model_tester = RobertaModelTest.RobertaModelTester(self)
+        self.model_tester = RobertaModelTester(self)
        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)

    def test_config(self):
...
...
tests/test_modeling_t5.py View file @ c852036b
...
...
@@ -30,6 +30,268 @@ if is_torch_available():
from transformers.tokenization_t5 import T5Tokenizer
class T5ModelTester:
    def __init__(self, parent):
        self.parent = parent
        self.batch_size = 13
        self.encoder_seq_length = 7
        self.decoder_seq_length = 9
        self.is_training = True
        self.use_attention_mask = True
        self.use_labels = True
        self.vocab_size = 99
        self.n_positions = 14
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.d_ff = 37
        self.relative_attention_num_buckets = 8
        self.dropout_rate = 0.1
        self.initializer_factor = 0.002
        self.eos_token_id = 1
        self.pad_token_id = 0
        self.decoder_start_token_id = 0
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        attention_mask = None
        decoder_attention_mask = None
        if self.use_attention_mask:
            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)

        lm_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        config = T5Config(
            vocab_size=self.vocab_size,
            n_positions=self.n_positions,
            d_model=self.hidden_size,
            d_ff=self.d_ff,
            d_kv=self.hidden_size // self.num_attention_heads,
            num_layers=self.num_hidden_layers,
            num_heads=self.num_attention_heads,
            relative_attention_num_buckets=self.relative_attention_num_buckets,
            dropout_rate=self.dropout_rate,
            initializer_factor=self.initializer_factor,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.pad_token_id,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
        )

        return (
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        )

    def check_loss_output(self, result):
        self.parent.assertListEqual(list(result["loss"].size()), [])

    def check_prepare_lm_labels_via_shift_left(
        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
    ):
        model = T5Model(config=config)
        model.to(torch_device)
        model.eval()

        # make sure that lm_labels are correctly padded from the right
        lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)

        # add casaul pad token mask
        triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
        lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
        decoder_input_ids = model._shift_right(lm_labels)

        for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
            # first item
            self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
            if i < decoder_input_ids_slice.shape[-1]:
                if i < decoder_input_ids.shape[-1] - 1:
                    # items before diagonal
                    self.parent.assertListEqual(
                        decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
                    )
                # pad items after diagonal
                if i < decoder_input_ids.shape[-1] - 2:
                    self.parent.assertListEqual(
                        decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
                    )
            else:
                # all items after square
                self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
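A toy illustration (not from the diff) of the convention check_prepare_lm_labels_via_shift_left verifies: T5's _shift_right turns the labels into decoder inputs by prepending decoder_start_token_id and dropping the last label.

lm_labels = [5, 6, 7, 1]          # 1 plays the role of eos_token_id in this tester
decoder_start_token_id = 0        # the tester sets decoder_start_token_id = pad_token_id = 0
decoder_input_ids = [decoder_start_token_id] + lm_labels[:-1]
assert decoder_input_ids == [0, 5, 6, 7]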
    def create_and_check_t5_model(
        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
    ):
        model = T5Model(config=config)
        model.to(torch_device)
        model.eval()

        decoder_output, decoder_past, encoder_output = model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )
        decoder_output, decoder_past, encoder_output = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        result = {
            "encoder_output": encoder_output,
            "decoder_output": decoder_output,
            "decoder_past": decoder_past,
        }
        self.parent.assertListEqual(
            list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(
            list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
        )
        self.parent.assertEqual(len(decoder_past), 2)
        # decoder_past[0] should correspond to encoder output
        self.parent.assertTrue(torch.all(decoder_past[0][0] == encoder_output))
        # There should be `num_layers` key value embeddings stored in decoder_past[1]
        self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
        self.parent.assertEqual(len(decoder_past[1][0]), 4)
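    # Illustrative sketch (added for exposition, not part of this commit): how the nested
    # `decoder_past` structure checked above could be unpacked. The names are assumptions based
    # only on the assertions in create_and_check_t5_model, not on the modeling code itself.
    def _example_unpack_decoder_past(self, decoder_past):
        encoder_part, layer_caches = decoder_past  # len(decoder_past) == 2; encoder_part[0] is the encoder output
        for layer_cache in layer_caches:  # one 4-tuple per decoder layer
            self_attn_key, self_attn_value, cross_attn_key, cross_attn_value = layer_cache
        return encoder_part[0]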
    def create_and_check_t5_with_lm_head(
        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
    ):
        model = T5ForConditionalGeneration(config=config)
        model.to(torch_device)
        model.eval()

        outputs = model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )
        loss, prediction_scores, _, _ = outputs
        self.parent.assertEqual(len(outputs), 4)
        result = {
            "loss": loss,
            "prediction_scores": prediction_scores,
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
        )
        self.check_loss_output(result)
    def create_and_check_t5_decoder_model_past(
        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
    ):
        model = T5Model(config=config).get_decoder()
        model.to(torch_device)
        model.eval()

        # first forward pass
        output, past_key_value_states = model(input_ids, use_cache=True)

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # append to next input_ids
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)

        output_from_no_past = model(next_input_ids)[0]
        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
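    # Illustrative sketch (added for exposition, not part of this commit): the equivalence the
    # check above verifies, stated directly. Feeding the full sequence at once and feeding only
    # the newest token together with the cached key/value states should produce the same hidden
    # state for that position. `decoder` is assumed to behave like the one used above.
    def _example_past_equivalence(self, decoder, input_ids, next_tokens):
        full_output = decoder(torch.cat([input_ids, next_tokens], dim=-1))[0][:, -1]
        _, cache = decoder(input_ids, use_cache=True)
        cached_output = decoder(next_tokens, past_key_value_states=cache)[0][:, 0]
        return torch.allclose(full_output, cached_output, atol=1e-3)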
    def create_and_check_t5_decoder_model_attention_mask_past(
        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
    ):
        model = T5Model(config=config).get_decoder()
        model.to(torch_device)
        model.eval()

        # create attention mask
        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

        half_seq_length = input_ids.shape[-1] // 2
        attn_mask[:, half_seq_length:] = 0

        # first forward pass
        output, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids
        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens

        # append to next input_ids and attn_mask
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        attn_mask = torch.cat(
            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
        )

        # get two different outputs
        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask)[0]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
    def create_t5_and_check_t5_generate_with_past_key_value_states(
        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
    ):
        model = T5ForConditionalGeneration(config=config)
        model.to(torch_device)
        model.eval()
        torch.manual_seed(0)
        output_without_past_cache = model.generate(
            input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
        )
        torch.manual_seed(0)
        output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
        self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
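    # Illustrative sketch (added for exposition, not part of this commit): the same seeded-sampling
    # comparison written as a reusable helper. Because both calls re-seed the RNG identically, any
    # difference in the generated ids can only come from the use_cache code path. The generation
    # arguments are assumptions chosen to mirror the check above.
    def _example_cache_equivalence_of_generate(self, model, input_ids):
        torch.manual_seed(0)
        without_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False)
        torch.manual_seed(0)
        with_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
        return bool(torch.all(with_cache == without_cache))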
    def create_and_check_t5_model_fp16_forward(
        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
    ):
        model = T5Model(config=config)
        model.to(torch_device)
        model.half()
        model.eval()
        output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[0]
        self.parent.assertFalse(torch.isnan(output).any().item())
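    # Illustrative sketch (added for exposition, not part of this commit): the NaN check used
    # above pulled out as a standalone helper, usable on any tensor produced under half precision.
    def _example_assert_no_nan(self, tensor):
        self.parent.assertFalse(torch.isnan(tensor).any().item())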
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            decoder_input_ids,
            attention_mask,
            decoder_attention_mask,
            lm_labels,
        ) = config_and_inputs

        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "use_cache": False,
        }
        return config, inputs_dict
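    # Illustrative sketch (added for exposition, not part of this commit): how a test method in the
    # T5ModelTest class below typically drives this tester. The method name is an assumption; the
    # pattern (prepare the config and inputs, then call a create_and_check_* helper) is the one the
    # hoisted tester is designed around.
    #
    # def test_t5_model(self):
    #     config_and_inputs = self.model_tester.prepare_config_and_inputs()
    #     self.model_tester.create_and_check_t5_model(*config_and_inputs)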
@require_torch
class T5ModelTest(ModelTesterMixin, unittest.TestCase):
...
...
@@ -40,302 +302,8 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
    test_resize_embeddings = False
    is_encoder_decoder = True
    class T5ModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            encoder_seq_length=7,
            decoder_seq_length=9,
            is_training=True,
            use_attention_mask=True,
            use_labels=True,
            vocab_size=99,
            n_positions=14,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            d_ff=37,
            relative_attention_num_buckets=8,
            dropout_rate=0.1,
            initializer_factor=0.002,
            eos_token_id=1,
            pad_token_id=0,
            decoder_start_token_id=0,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.encoder_seq_length = encoder_seq_length
            self.decoder_seq_length = decoder_seq_length
            self.is_training = is_training
            self.use_attention_mask = use_attention_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.scope = scope
            self.eos_token_id = eos_token_id
            self.pad_token_id = pad_token_id
            self.decoder_start_token_id = decoder_start_token_id
        # ... (remaining methods of the removed nested T5ModelTester were identical to those of the hoisted T5ModelTester above) ...
    def setUp(self):
        self.model_tester = T5ModelTest.T5ModelTester(self)
        self.model_tester = T5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
...
...
tests/test_modeling_tf_albert.py View file @ c852036b
...
...
@@ -34,6 +34,186 @@ if is_tf_available():
    )
class TFAlbertModelTester:
    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=True,
        use_labels=True,
        vocab_size=99,
        embedding_size=16,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        num_choices=4,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.embedding_size = 16
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = AlbertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_albert_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFAlbertModel(config=config)
        # inputs = {'input_ids': input_ids,
        #           'attention_mask': input_mask,
        #           'token_type_ids': token_type_ids}
        # sequence_output, pooled_output = model(**inputs)

        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output, pooled_output = model(inputs)

        inputs = [input_ids, input_mask]
        sequence_output, pooled_output = model(inputs)

        sequence_output, pooled_output = model(input_ids)

        result = {
            "sequence_output": sequence_output.numpy(),
            "pooled_output": pooled_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
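    # Illustrative sketch (added for exposition, not part of this commit): the three call
    # conventions exercised above, shown with toy tensors. The TF models in these tests accept a
    # dict keyed by input name, a positional list, or a bare input_ids tensor. The tensor values
    # below are invented for the example.
    def _example_tf_call_conventions(self, model):
        import tensorflow as tf

        input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
        attention_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
        outputs_from_dict = model({"input_ids": input_ids, "attention_mask": attention_mask})
        outputs_from_list = model([input_ids, attention_mask])
        outputs_from_tensor = model(input_ids)
        return outputs_from_dict, outputs_from_list, outputs_from_tensor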
    def create_and_check_albert_for_pretraining(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFAlbertForPreTraining(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        prediction_scores, sop_scores = model(inputs)
        result = {
            "prediction_scores": prediction_scores.numpy(),
            "sop_scores": sop_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(list(result["sop_scores"].shape), [self.batch_size, self.num_labels])
    def create_and_check_albert_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFAlbertForMaskedLM(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (prediction_scores,) = model(inputs)
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
    def create_and_check_albert_for_sequence_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFAlbertForSequenceClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
    def create_and_check_albert_for_question_answering(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFAlbertForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        start_logits, end_logits = model(inputs)
        result = {
            "start_logits": start_logits.numpy(),
            "end_logits": end_logits.numpy(),
        }
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
@require_tf
class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
...
...
@@ -49,187 +229,8 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
        else ()
    )
TFAlbertModelTester
(
object
):
def
__init__
(
self
,
parent
,
batch_size
=
13
,
seq_length
=
7
,
is_training
=
True
,
use_input_mask
=
True
,
use_token_type_ids
=
True
,
use_labels
=
True
,
vocab_size
=
99
,
embedding_size
=
16
,
hidden_size
=
32
,
num_hidden_layers
=
5
,
num_attention_heads
=
4
,
intermediate_size
=
37
,
hidden_act
=
"gelu"
,
hidden_dropout_prob
=
0.1
,
attention_probs_dropout_prob
=
0.1
,
max_position_embeddings
=
512
,
type_vocab_size
=
16
,
type_sequence_label_size
=
2
,
initializer_range
=
0.02
,
num_labels
=
3
,
num_choices
=
4
,
scope
=
None
,
):
self
.
parent
=
parent
self
.
batch_size
=
batch_size
self
.
seq_length
=
seq_length
self
.
is_training
=
is_training
self
.
use_input_mask
=
use_input_mask
self
.
use_token_type_ids
=
use_token_type_ids
self
.
use_labels
=
use_labels
self
.
vocab_size
=
vocab_size
self
.
embedding_size
=
embedding_size
self
.
hidden_size
=
hidden_size
self
.
num_hidden_layers
=
num_hidden_layers
self
.
num_attention_heads
=
num_attention_heads
self
.
intermediate_size
=
intermediate_size
self
.
hidden_act
=
hidden_act
self
.
hidden_dropout_prob
=
hidden_dropout_prob
self
.
attention_probs_dropout_prob
=
attention_probs_dropout_prob
self
.
max_position_embeddings
=
max_position_embeddings
self
.
type_vocab_size
=
type_vocab_size
self
.
type_sequence_label_size
=
type_sequence_label_size
self
.
initializer_range
=
initializer_range
self
.
num_labels
=
num_labels
self
.
num_choices
=
num_choices
self
.
scope
=
scope
        # ... (remaining methods of the removed nested TFAlbertModelTester were identical to those of the hoisted TFAlbertModelTester above) ...
    def setUp(self):
        self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
        self.model_tester = TFAlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):
...
...
tests/test_modeling_tf_bert.py View file @ c852036b
...
...
@@ -37,6 +37,221 @@ if is_tf_available():
    )
class TFBertModelTester:
    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        num_choices=4,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = BertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def create_and_check_bert_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output, pooled_output = model(inputs)

        inputs = [input_ids, input_mask]
        sequence_output, pooled_output = model(inputs)

        sequence_output, pooled_output = model(input_ids)

        result = {
            "sequence_output": sequence_output.numpy(),
            "pooled_output": pooled_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
    def create_and_check_bert_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertForMaskedLM(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (prediction_scores,) = model(inputs)
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
    def create_and_check_bert_for_next_sequence_prediction(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertForNextSentencePrediction(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (seq_relationship_score,) = model(inputs)
        result = {
            "seq_relationship_score": seq_relationship_score.numpy(),
        }
        self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
    def create_and_check_bert_for_pretraining(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertForPreTraining(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        prediction_scores, seq_relationship_score = model(inputs)
        result = {
            "prediction_scores": prediction_scores.numpy(),
            "seq_relationship_score": seq_relationship_score.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
    def create_and_check_bert_for_sequence_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFBertForSequenceClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
    def create_and_check_bert_for_multiple_choice(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_choices = self.num_choices
        model = TFBertForMultipleChoice(config=config)
        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
        inputs = {
            "input_ids": multiple_choice_inputs_ids,
            "attention_mask": multiple_choice_input_mask,
            "token_type_ids": multiple_choice_token_type_ids,
        }
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
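    # Illustrative sketch (added for exposition, not part of this commit): the reshaping used above
    # for multiple choice, with the shapes spelled out. expand_dims adds a choices axis and tile
    # repeats each sequence once per choice, so a (batch, seq_len) tensor becomes
    # (batch, num_choices, seq_len). The values below are invented for the example.
    def _example_multiple_choice_shapes(self):
        input_ids = tf.constant([[7, 8, 9]])      # shape (1, 3)
        expanded = tf.expand_dims(input_ids, 1)   # shape (1, 1, 3)
        tiled = tf.tile(expanded, (1, 4, 1))      # shape (1, 4, 3) when num_choices == 4
        return tiled.shape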
    def create_and_check_bert_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFBertForTokenClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
    def create_and_check_bert_for_question_answering(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        start_logits, end_logits = model(inputs)
        result = {
            "start_logits": start_logits.numpy(),
            "end_logits": end_logits.numpy(),
        }
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
@require_tf
class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
...
...
@@ -55,224 +270,8 @@ class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
        else ()
    )
    class TFBertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
        # ... (remaining methods of the removed nested TFBertModelTester were identical to those of the hoisted TFBertModelTester above) ...
    def setUp(self):
        self.model_tester = TFBertModelTest.TFBertModelTester(self)
        self.model_tester = TFBertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)

    def test_config(self):
...
...
tests/test_modeling_tf_ctrl.py View file @ c852036b
...
...
@@ -28,163 +28,141 @@ if is_tf_available():
    from transformers.modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST
class TFCTRLModelTester(object):
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_token_type_ids = True
        self.use_input_mask = True
        self.use_labels = True
        self.use_mc_token_ids = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = CTRLConfig(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )
    def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFCTRLModel(config=config)

        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output = model(inputs)[0]

        inputs = [input_ids, None, input_mask]  # None is the input for 'past'
        sequence_output = model(inputs)[0]

        sequence_output = model(input_ids)[0]

        result = {
            "sequence_output": sequence_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
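    # Illustrative sketch (added for exposition, not part of this commit): why the positional list
    # above is [input_ids, None, input_mask]. When the model is called with a positional list, the
    # second slot is the `past` cache, so None is passed explicitly in order to reach the attention
    # mask in the third slot. The tensor values below are invented for the example.
    def _example_positional_past_slot(self, model):
        import tensorflow as tf

        input_ids = tf.constant([[31, 51, 99]])
        attention_mask = tf.constant([[1, 1, 1]])
        outputs = model([input_ids, None, attention_mask])  # past=None, attention_mask in the third slot
        return outputs[0]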
    def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFCTRLLMHeadModel(config=config)

        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        prediction_scores = model(inputs)[0]
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()

        (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs

        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
@require_tf
class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
    all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else ()
    class TFCTRLModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_input_mask=True,
            use_labels=True,
            use_mc_token_ids=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
        # ... (remaining methods of the removed nested TFCTRLModelTester were identical to those of the hoisted TFCTRLModelTester above) ...
    def setUp(self):
        self.model_tester = TFCTRLModelTest.TFCTRLModelTester(self)  # removed line (nested tester)
        self.model_tester = TFCTRLModelTester(self)  # added line (hoisted tester)
        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)

    def test_config(self):
...
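The hunk above shows the shape of the change repeated for every model in this commit: the `*ModelTester` helper used to be a class nested inside the `unittest.TestCase` (instantiated as `TFCTRLModelTest.TFCTRLModelTester(self)`), and is now a module-level class that `setUp` builds directly. A minimal self-contained sketch of that structure, using invented dummy names rather than the real transformers/TensorFlow classes:

import unittest


class _DummyModelTester:
    """Stand-in for a hoisted *ModelTester: holds shared sizes and builds fake inputs."""

    def __init__(self, parent):
        self.parent = parent  # the TestCase, used for assertions
        self.batch_size = 13
        self.seq_length = 7

    def prepare_config_and_inputs(self):
        # a real tester would return (config, input_ids, masks, labels, ...)
        return {"batch_size": self.batch_size, "seq_length": self.seq_length}

    def check_shapes(self, inputs):
        self.parent.assertEqual(inputs["batch_size"], 13)
        self.parent.assertEqual(inputs["seq_length"], 7)


class DummyModelTest(unittest.TestCase):
    def setUp(self):
        # after the hoist: instantiate the module-level tester directly
        self.model_tester = _DummyModelTester(self)

    def test_shapes(self):
        self.model_tester.check_shapes(self.model_tester.prepare_config_and_inputs())


if __name__ == "__main__":
    unittest.main()

Because the tester is now importable on its own, other test modules can reuse it without going through the TestCase that owns it.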
tests/test_modeling_tf_distilbert.py  View file @ c852036b
...
@@ -32,6 +32,128 @@ if is_tf_available():
)


class TFDistilBertModelTester:
    def __init__(self, parent,):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = False
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = DistilBertConfig(
            vocab_size=self.vocab_size, dim=self.hidden_size, n_layers=self.num_hidden_layers,
            n_heads=self.num_attention_heads, hidden_dim=self.intermediate_size, hidden_act=self.hidden_act,
            dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range,
        )

        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

    def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFDistilBertModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
        outputs = model(inputs)
        sequence_output = outputs[0]

        inputs = [input_ids, input_mask]
        (sequence_output,) = model(inputs)

        result = {"sequence_output": sequence_output.numpy()}
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFDistilBertForMaskedLM(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
        (prediction_scores,) = model(inputs)
        result = {"prediction_scores": prediction_scores.numpy()}
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFDistilBertForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
        start_logits, end_logits = model(inputs)
        result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

    def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        config.num_labels = self.num_labels
        model = TFDistilBertForSequenceClassification(config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
        (logits,) = model(inputs)
        result = {"logits": logits.numpy()}
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
        return config, inputs_dict
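Each hoisted tester ends with `prepare_config_and_inputs_for_common`, which repacks the full tuple into a `(config, inputs_dict)` pair; the shared `TFModelTesterMixin` tests only ever consume that pair. A rough sketch of the hand-off with made-up stand-in classes (none of these names are transformers APIs):

# Invented stand-ins (FakeConfig / FakeModel / FakeTester are not transformers classes);
# they only illustrate the (config, inputs_dict) contract consumed by the common tests.
class FakeConfig:
    hidden_size = 32


class FakeModel:
    def __init__(self, config):
        self.config = config

    def __call__(self, inputs):
        batch, seq = len(inputs["input_ids"]), len(inputs["input_ids"][0])
        return [batch, seq, self.config.hidden_size]  # stands in for the hidden-states shape


class FakeTester:
    def prepare_config_and_inputs_for_common(self):
        inputs_dict = {"input_ids": [[1, 2, 3]] * 4, "attention_mask": [[1, 1, 1]] * 4}
        return FakeConfig(), inputs_dict


def run_common_shape_check(model_tester, model_classes):
    # roughly what a mixin test does: unpack the pair and call every registered class on inputs_dict
    config, inputs_dict = model_tester.prepare_config_and_inputs_for_common()
    for model_class in model_classes:
        assert model_class(config)(inputs_dict)[-1] == config.hidden_size


run_common_shape_check(FakeTester(), (FakeModel,))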
@require_tf
class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase):
...
@@ -50,151 +172,8 @@ class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase):
    test_resize_embeddings = True
    test_head_masking = True

    class TFDistilBertModelTester(object):
        def __init__(
            self, parent, batch_size=13, seq_length=7, is_training=True, use_input_mask=True,
            use_token_type_ids=False, use_labels=True, vocab_size=99, hidden_size=32, num_hidden_layers=5,
            num_attention_heads=4, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16,
            type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
        # (prepare_config_and_inputs, the create_and_check_distilbert_* checks and
        # prepare_config_and_inputs_for_common of this removed inner class are verbatim
        # copies of the hoisted TFDistilBertModelTester above.)
    def setUp(self):
        self.model_tester = TFDistilBertModelTest.TFDistilBertModelTester(self)  # removed line (nested tester)
        self.model_tester = TFDistilBertModelTester(self)  # added line (hoisted tester)
        self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)

    def test_config(self):
...
tests/test_modeling_tf_electra.py  View file @ c852036b
...
@@ -32,6 +32,138 @@ if is_tf_available():
)


class TFElectraModelTester:
    def __init__(self, parent,):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = ElectraConfig(
            vocab_size=self.vocab_size, hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size, hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

    def create_and_check_electra_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFElectraModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (sequence_output,) = model(inputs)

        inputs = [input_ids, input_mask]
        (sequence_output,) = model(inputs)

        (sequence_output,) = model(input_ids)

        result = {"sequence_output": sequence_output.numpy()}
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_electra_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFElectraForMaskedLM(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (prediction_scores,) = model(inputs)
        result = {"prediction_scores": prediction_scores.numpy()}
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_electra_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFElectraForPreTraining(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (prediction_scores,) = model(inputs)
        result = {"prediction_scores": prediction_scores.numpy()}
        self.parent.assertListEqual(list(result["prediction_scores"].shape), [self.batch_size, self.seq_length])

    def create_and_check_electra_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        config.num_labels = self.num_labels
        model = TFElectraForTokenClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (logits,) = model(inputs)
        result = {"logits": logits.numpy()}
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels,) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
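All of these testers fabricate their inputs with the test helper `ids_tensor(shape, vocab_size)`: a tensor of the requested shape filled with random ids in `[0, vocab_size)`. A pure-Python approximation of that contract (the real helper returns a `tf.Tensor`; only the shape and the value range matter to the checks above):

import random


def ids_tensor(shape, vocab_size, rng=None):
    """Nested lists of random ids in [0, vocab_size), mirroring the helper's contract."""
    rng = rng or random.Random(0)

    def build(dims):
        if len(dims) == 1:
            return [rng.randrange(vocab_size) for _ in range(dims[0])]
        return [build(dims[1:]) for _ in range(dims[0])]

    return build(list(shape))


# e.g. a (13, 7) batch of token ids drawn from a 99-word vocabulary:
fake_input_ids = ids_tensor([13, 7], 99)
assert len(fake_input_ids) == 13 and len(fake_input_ids[0]) == 7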
@require_tf
class TFElectraModelTest(TFModelTesterMixin, unittest.TestCase):
...
@@ -41,163 +173,8 @@ class TFElectraModelTest(TFModelTesterMixin, unittest.TestCase):
        else ()
    )

    class TFElectraModelTester(object):
        def __init__(
            self, parent, batch_size=13, seq_length=7, is_training=True, use_input_mask=True,
            use_token_type_ids=True, use_labels=True, vocab_size=99, hidden_size=32, num_hidden_layers=5,
            num_attention_heads=4, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16,
            type_sequence_label_size=2, initializer_range=0.02, num_labels=3, num_choices=4, scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
        # (prepare_config_and_inputs, the create_and_check_electra_* checks and
        # prepare_config_and_inputs_for_common of this removed inner class are verbatim
        # copies of the hoisted TFElectraModelTester above.)
    def setUp(self):
        self.model_tester = TFElectraModelTest.TFElectraModelTester(self)  # removed line (nested tester)
        self.model_tester = TFElectraModelTester(self)  # added line (hoisted tester)
        self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37)

    def test_config(self):
...
tests/test_modeling_tf_gpt2.py  View file @ c852036b
...
@@ -34,268 +34,246 @@ if is_tf_available():
)


class TFGPT2ModelTester:
    def __init__(self, parent,):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_token_type_ids = True
        self.use_input_mask = True
        self.use_labels = True
        self.use_mc_token_ids = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None
        self.bos_token_id = self.vocab_size - 1
        self.eos_token_id = self.vocab_size - 1

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = GPT2Config(
            vocab_size=self.vocab_size, n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers, n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
                sequence_labels, token_labels, choice_labels)

    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFGPT2Model(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output = model(inputs)[0]

        inputs = [input_ids, None, input_mask]  # None is the input for 'past'
        sequence_output = model(inputs)[0]

        sequence_output = model(input_ids)[0]

        result = {"sequence_output": sequence_output.numpy()}
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size],
        )

    def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFGPT2Model(config=config)

        # first forward pass
        output, past = model(input_ids, token_type_ids=token_type_ids)

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
        next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)

        # append to next input_ids and token_type_ids
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)

        output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
        output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)

        # select random slice
        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)

    def create_and_check_gpt2_model_attention_mask_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFGPT2Model(config=config)

        # create attention mask
        half_seq_length = self.seq_length // 2
        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)

        # first forward pass
        output, past = model(input_ids, attention_mask=attn_mask)

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids
        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
        condition = tf.transpose(
            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
        )
        input_ids = tf.where(condition, random_other_next_tokens, input_ids)

        # append to next input_ids and attn_mask
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)

        # get two different outputs
        output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
        output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)

        # select random slice
        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)

    def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFGPT2LMHeadModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        prediction_scores = model(inputs)[0]
        result = {"prediction_scores": prediction_scores.numpy()}
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size],
        )

    def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
        model = TFGPT2DoubleHeadsModel(config=config)

        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))

        inputs = {
            "input_ids": multiple_choice_inputs_ids,
            "mc_token_ids": mc_token_ids,
            "attention_mask": multiple_choice_input_mask,
            "token_type_ids": multiple_choice_token_type_ids,
        }
        lm_logits, mc_logits = model(inputs)[:2]
        result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
        self.parent.assertListEqual(
            list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
        )
        self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
         sequence_labels, token_labels, choice_labels,) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
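`create_and_check_gpt2_model_past` above asserts that feeding just the new token plus the cached `past` state reproduces the last position of a full forward pass over the extended sequence. The same assertion pattern on a toy stateful model, where a running sum stands in for the transformer cache (illustrative only, no TensorFlow needed):

def toy_model(tokens, past=None):
    """Return per-position outputs (running sums) plus a cache of the final state."""
    state = past if past is not None else 0
    outputs = []
    for t in tokens:
        state += t
        outputs.append(state)
    return outputs, state  # (outputs, past)


tokens, next_token = [3, 1, 4, 1, 5], 9

# full forward over the extended sequence
full_outputs, _ = toy_model(tokens + [next_token])

# incremental forward: only the new token plus the cached state
_, past = toy_model(tokens)
step_outputs, _ = toy_model([next_token], past=past)

# the test above asserts the same equality with tf.debugging.assert_near on a random slice
assert step_outputs[0] == full_outputs[-1]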
@require_tf
class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
    all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()
    all_generative_model_classes = (TFGPT2LMHeadModel,) if is_tf_available() else ()

    class TFGPT2ModelTester(object):
        def __init__(
            self, parent, batch_size=13, seq_length=7, is_training=True, use_token_type_ids=True,
            use_input_mask=True, use_labels=True, use_mc_token_ids=True, vocab_size=99, hidden_size=32,
            num_hidden_layers=5, num_attention_heads=4, intermediate_size=37, hidden_act="gelu",
            hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512,
            type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, num_labels=3,
            num_choices=4, scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.bos_token_id = vocab_size - 1
            self.eos_token_id = vocab_size - 1
        # (prepare_config_and_inputs, create_and_check_gpt2_model, create_and_check_gpt2_model_past,
        # create_and_check_gpt2_model_attention_mask_past, create_and_check_gpt2_lm_head,
        # create_and_check_gpt2_double_head and prepare_config_and_inputs_for_common of this removed
        # inner class are verbatim copies of the hoisted TFGPT2ModelTester above.)
    def setUp(self):
        self.model_tester = TFGPT2ModelTest.TFGPT2ModelTester(self)  # removed line (nested tester)
        self.model_tester = TFGPT2ModelTester(self)  # added line (hoisted tester)
        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)

    def test_config(self):
...
tests/test_modeling_tf_openai_gpt.py  View file @ c852036b
...
@@ -33,6 +33,155 @@ if is_tf_available():
)


class TFOpenAIGPTModelTester:
    def __init__(self, parent,):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_token_type_ids = True
        self.use_input_mask = True
        self.use_labels = True
        self.use_mc_token_ids = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = OpenAIGPTConfig(
            vocab_size=self.vocab_size, n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers, n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
                sequence_labels, token_labels, choice_labels)

    def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFOpenAIGPTModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output = model(inputs)[0]

        inputs = [input_ids, input_mask]
        sequence_output = model(inputs)[0]

        sequence_output = model(input_ids)[0]

        result = {"sequence_output": sequence_output.numpy()}
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFOpenAIGPTLMHeadModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        prediction_scores = model(inputs)[0]
        result = {"prediction_scores": prediction_scores.numpy()}
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_openai_gpt_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
        model = TFOpenAIGPTDoubleHeadsModel(config=config)

        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))

        inputs = {
            "input_ids": multiple_choice_inputs_ids,
            "mc_token_ids": mc_token_ids,
            "attention_mask": multiple_choice_input_mask,
            "token_type_ids": multiple_choice_token_type_ids,
        }
        lm_logits, mc_logits = model(inputs)[:2]
        result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
        self.parent.assertListEqual(
            list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
         sequence_labels, token_labels, choice_labels,) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
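The double-heads check turns each `[batch, seq]` tensor into a `[batch, num_choices, seq]` tensor by inserting a choice axis with `tf.expand_dims` and repeating it with `tf.tile`. A standalone illustration of that reshaping, assuming only that TensorFlow 2 is installed (the sizes below are arbitrary):

import tensorflow as tf

batch_size, seq_length, num_choices = 2, 5, 4
input_ids = tf.random.uniform((batch_size, seq_length), maxval=99, dtype=tf.int32)

# [batch, seq] -> [batch, 1, seq] -> [batch, num_choices, seq]
multiple_choice_input_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, num_choices, 1))

assert multiple_choice_input_ids.shape == (batch_size, num_choices, seq_length)
# every choice sees the same tokens here; a real multiple-choice batch would differ per choice
assert bool(tf.reduce_all(multiple_choice_input_ids[:, 0, :] == multiple_choice_input_ids[:, 1, :]))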
@require_tf
class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):
...
@@ -43,179 +192,8 @@ class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):
        (TFOpenAIGPTLMHeadModel,) if is_tf_available() else ()
    )
    # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly

    class TFOpenAIGPTModelTester(object):
        def __init__(
            self, parent, batch_size=13, seq_length=7, is_training=True, use_token_type_ids=True,
            use_input_mask=True, use_labels=True, use_mc_token_ids=True, vocab_size=99, hidden_size=32,
            num_hidden_layers=5, num_attention_heads=4, intermediate_size=37, hidden_act="gelu",
            hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512,
            type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, num_labels=3,
            num_choices=4, scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
        # (prepare_config_and_inputs, the create_and_check_openai_gpt_* checks and
        # prepare_config_and_inputs_for_common of this removed inner class are verbatim
        # copies of the hoisted TFOpenAIGPTModelTester above.)
    def setUp(self):
        self.model_tester = TFOpenAIGPTModelTest.TFOpenAIGPTModelTester(self)  # removed line (nested tester)
        self.model_tester = TFOpenAIGPTModelTester(self)  # added line (hoisted tester)
        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)

    def test_config(self):
...
tests/test_modeling_tf_roberta.py  View file @ c852036b
...
@@ -36,6 +36,139 @@ if is_tf_available():
)


class TFRobertaModelTester:
    def __init__(self, parent,):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = RobertaConfig(
            vocab_size=self.vocab_size, hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size, hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

    def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFRobertaModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output = model(inputs)[0]

        inputs = [input_ids, input_mask]
        sequence_output = model(inputs)[0]

        sequence_output = model(input_ids)[0]

        result = {"sequence_output": sequence_output.numpy()}
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFRobertaForMaskedLM(config=config)
        prediction_scores = model([input_ids, input_mask, token_type_ids])[0]
        result = {"prediction_scores": prediction_scores.numpy()}
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        config.num_labels = self.num_labels
        model = TFRobertaForTokenClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (logits,) = model(inputs)
        result = {"logits": logits.numpy()}
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])

    def create_and_check_roberta_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = TFRobertaForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        start_logits, end_logits = model(inputs)
        result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels,) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict
@require_tf
class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase):
...
...
@@ -51,164 +184,8 @@ class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase):
        else ()
    )
    class TFRobertaModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = RobertaConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_roberta_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFRobertaModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            sequence_output = model(inputs)[0]

            inputs = [input_ids, input_mask]
            sequence_output = model(inputs)[0]

            sequence_output = model(input_ids)[0]

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_roberta_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFRobertaForMaskedLM(config=config)
            prediction_scores = model([input_ids, input_mask, token_type_ids])[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_roberta_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFRobertaForTokenClassification(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
            )

        def create_and_check_roberta_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFRobertaForQuestionAnswering(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            start_logits, end_logits = model(inputs)
            result = {
                "start_logits": start_logits.numpy(),
                "end_logits": end_logits.numpy(),
            }
            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict
    def setUp(self):
        self.model_tester = TFRobertaModelTest.TFRobertaModelTester(self)
        self.model_tester = TFRobertaModelTester(self)
        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)

    def test_config(self):
...
...
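Note (illustration, not part of this commit's diff): with the tester hoisted to module level, the collapsed test methods above simply build inputs through self.model_tester and forward them to the matching create_and_check_* helper. A minimal sketch of that call pattern, with hypothetical method names:

    def test_roberta_model(self):
        # Build (config, input_ids, ...) via the module-level tester and delegate the checks to it.
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_roberta_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)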
tests/test_modeling_tf_t5.py
View file @ c852036b
...
...
@@ -28,6 +28,186 @@ if is_tf_available():
    from transformers import TFT5Model, TFT5ForConditionalGeneration, T5Tokenizer
class TFT5ModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_labels = True
        self.vocab_size = 99
        self.n_positions = 14
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.d_ff = 37
        self.relative_attention_num_buckets = 8
        self.dropout_rate = 0.1
        self.initializer_factor = 0.002
        self.eos_token_id = 1
        self.pad_token_id = 0
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_labels = None
        if self.use_labels:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        config = T5Config(
            vocab_size=self.vocab_size,
            n_positions=self.n_positions,
            d_model=self.hidden_size,
            d_ff=self.d_ff,
            d_kv=self.hidden_size // self.num_attention_heads,
            num_layers=self.num_hidden_layers,
            num_heads=self.num_attention_heads,
            relative_attention_num_buckets=self.relative_attention_num_buckets,
            dropout_rate=self.dropout_rate,
            initializer_factor=self.initializer_factor,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.pad_token_id,
            pad_token_id=self.pad_token_id,
        )

        return (config, input_ids, input_mask, token_labels)

    def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
        model = TFT5Model(config=config)
        inputs = {
            "inputs": input_ids,
            "decoder_input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        decoder_output, decoder_past, encoder_output = model(inputs)

        decoder_output, decoder_past, encoder_output = model(
            input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids
        )

        result = {
            "encoder_output": encoder_output.numpy(),
            "decoder_past": decoder_past,
            "decoder_output": decoder_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(
            list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertEqual(len(decoder_past), 2)
        # decoder_past[0] should correspond to encoder output
        self.parent.assertTrue(tf.reduce_all(tf.math.equal(decoder_past[0][0], encoder_output)))
        # There should be `num_layers` key value embeddings stored in decoder_past[1]
        self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
        self.parent.assertEqual(len(decoder_past[1][0]), 4)

    def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
        model = TFT5ForConditionalGeneration(config=config)
        inputs_dict = {
            "inputs": input_ids,
            "decoder_input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }

        prediction_scores, _, _ = model(inputs_dict)
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask):
        model = TFT5Model(config=config).get_decoder()

        input_ids = input_ids[:1, :]
        self.batch_size = 1

        # first forward pass
        _, past_key_value_states = model(input_ids, use_cache=True)

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # append to next input_ids and
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)

        output_from_no_past = model(next_input_ids)[0]
        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]

        # select random slice
        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

    def create_and_check_t5_decoder_model_attention_mask_past(
        self, config, input_ids, decoder_input_ids, attention_mask
    ):
        model = TFT5Model(config=config).get_decoder()

        # create attention mask
        half_seq_length = self.seq_length // 2
        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)

        # first forward pass
        _, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids
        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
        condition = tf.transpose(
            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
        )
        input_ids = tf.where(condition, random_other_next_tokens, input_ids)

        # append to next input_ids and attn_mask
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        attn_mask = tf.concat(
            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)], axis=1,
        )

        # get two different outputs
        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
        output_from_past = model(
            next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask
        )[0]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).numpy().item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, token_labels) = config_and_inputs
        inputs_dict = {
            "inputs": input_ids,
            "decoder_input_ids": input_ids,
            "decoder_attention_mask": input_mask,
            "use_cache": tf.convert_to_tensor([False]),
        }
        return config, inputs_dict
@require_tf
class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):
...
...
@@ -35,207 +215,8 @@ class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):
    all_model_classes = (TFT5Model, TFT5ForConditionalGeneration) if is_tf_available() else ()
    all_generative_model_classes = (TFT5ForConditionalGeneration,) if is_tf_available() else ()
    class TFT5ModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_labels=True,
            vocab_size=99,
            n_positions=14,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            d_ff=37,
            relative_attention_num_buckets=8,
            dropout_rate=0.1,
            initializer_factor=0.002,
            eos_token_id=1,
            pad_token_id=0,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.eos_token_id = eos_token_id
            self.pad_token_id = pad_token_id
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_labels = None
            if self.use_labels:
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            config = T5Config(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor,
                eos_token_id=self.eos_token_id,
                bos_token_id=self.pad_token_id,
                pad_token_id=self.pad_token_id,
            )

            return (config, input_ids, input_mask, token_labels)

        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
            model = TFT5Model(config=config)
            inputs = {
                "inputs": input_ids,
                "decoder_input_ids": input_ids,
                "decoder_attention_mask": input_mask,
            }
            decoder_output, decoder_past, encoder_output = model(inputs)

            decoder_output, decoder_past, encoder_output = model(
                input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids
            )

            result = {
                "encoder_output": encoder_output.numpy(),
                "decoder_past": decoder_past,
                "decoder_output": decoder_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(
                list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertEqual(len(decoder_past), 2)
            # decoder_past[0] should correspond to encoder output
            self.parent.assertTrue(tf.reduce_all(tf.math.equal(decoder_past[0][0], encoder_output)))
            # There should be `num_layers` key value embeddings stored in decoder_past[1]
            self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
            # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
            self.parent.assertEqual(len(decoder_past[1][0]), 4)

        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
            model = TFT5ForConditionalGeneration(config=config)
            inputs_dict = {
                "inputs": input_ids,
                "decoder_input_ids": input_ids,
                "decoder_attention_mask": input_mask,
            }

            prediction_scores, _, _ = model(inputs_dict)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask):
            model = TFT5Model(config=config).get_decoder()

            input_ids = input_ids[:1, :]
            self.batch_size = 1

            # first forward pass
            _, past_key_value_states = model(input_ids, use_cache=True)

            # create hypothetical next token and extent to next_input_ids
            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

            # append to next input_ids and
            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)

            output_from_no_past = model(next_input_ids)[0]
            output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]

            # select random slice
            random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
            output_from_past_slice = output_from_past[:, 0, random_slice_idx]

            # test that outputs are equal for slice
            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

        def create_and_check_t5_decoder_model_attention_mask_past(
            self, config, input_ids, decoder_input_ids, attention_mask
        ):
            model = TFT5Model(config=config).get_decoder()

            # create attention mask
            half_seq_length = self.seq_length // 2
            attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
            attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
            attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)

            # first forward pass
            _, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)

            # create hypothetical next token and extent to next_input_ids
            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

            # change a random masked slice from input_ids
            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
            random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
            vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
            condition = tf.transpose(
                tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
            )
            input_ids = tf.where(condition, random_other_next_tokens, input_ids)

            # append to next input_ids and attn_mask
            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
            attn_mask = tf.concat(
                [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)], axis=1,
            )

            # get two different outputs
            output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
            output_from_past = model(
                next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask
            )[0]

            # select random slice
            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).numpy().item()
            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
            output_from_past_slice = output_from_past[:, 0, random_slice_idx]

            # test that outputs are equal for slice
            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, token_labels) = config_and_inputs
            inputs_dict = {
                "inputs": input_ids,
                "decoder_input_ids": input_ids,
                "decoder_attention_mask": input_mask,
                "use_cache": tf.convert_to_tensor([False]),
            }
            return config, inputs_dict
    def setUp(self):
        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
        self.model_tester = TFT5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
...
...
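Aside (illustration only, not from the diff): both decoder-cache checks in TFT5ModelTester rest on the same invariant, that running the full sequence and running only the new token with past_key_value_states must agree at the new position. A toy TensorFlow snippet showing just the slice comparison that tf.debugging.assert_near performs, with stand-in tensors:

    import tensorflow as tf

    # Stand-in logits for a full-sequence run: (batch=1, seq_len=8, features=32).
    output_from_no_past = tf.random.uniform((1, 8, 32))
    # A correct cache makes the single-step run reproduce the last position exactly.
    output_from_past = output_from_no_past[:, -1:, :]

    random_slice_idx = 5  # any feature index works for this spot check
    tf.debugging.assert_near(
        output_from_past[:, 0, random_slice_idx],
        output_from_no_past[:, -1, random_slice_idx],
        rtol=1e-3,
    )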
tests/test_modeling_tf_transfo_xl.py
View file @ c852036b
...
...
@@ -33,6 +33,135 @@
    )
class TFTransfoXLModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.mem_len = 30
        self.key_length = self.seq_length + self.mem_len
        self.clamp_len = 15
        self.is_training = True
        self.use_labels = True
        self.vocab_size = 99
        self.cutoffs = [10, 50, 80]
        self.hidden_size = 32
        self.d_embed = 32
        self.num_attention_heads = 4
        self.d_head = 8
        self.d_inner = 128
        self.div_val = 2
        self.num_hidden_layers = 5
        self.scope = None
        self.seed = 1
        self.eos_token_id = 0

    def prepare_config_and_inputs(self):
        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        lm_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        config = TransfoXLConfig(
            vocab_size=self.vocab_size,
            mem_len=self.mem_len,
            clamp_len=self.clamp_len,
            cutoffs=self.cutoffs,
            d_model=self.hidden_size,
            d_embed=self.d_embed,
            n_head=self.num_attention_heads,
            d_head=self.d_head,
            d_inner=self.d_inner,
            div_val=self.div_val,
            n_layer=self.num_hidden_layers,
            eos_token_id=self.eos_token_id,
        )

        return (config, input_ids_1, input_ids_2, lm_labels)

    def set_seed(self):
        random.seed(self.seed)
        tf.random.set_seed(self.seed)

    def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
        model = TFTransfoXLModel(config)

        hidden_states_1, mems_1 = model(input_ids_1)

        inputs = {"input_ids": input_ids_2, "mems": mems_1}

        hidden_states_2, mems_2 = model(inputs)

        result = {
            "hidden_states_1": hidden_states_1.numpy(),
            "mems_1": [mem.numpy() for mem in mems_1],
            "hidden_states_2": hidden_states_2.numpy(),
            "mems_2": [mem.numpy() for mem in mems_2],
        }

        self.parent.assertListEqual(
            list(result["hidden_states_1"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(
            list(result["hidden_states_2"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_1"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_2"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

    def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
        model = TFTransfoXLLMHeadModel(config)

        lm_logits_1, mems_1 = model(input_ids_1)

        inputs = {"input_ids": input_ids_1, "labels": lm_labels}
        _, mems_1 = model(inputs)

        lm_logits_2, mems_2 = model([input_ids_2, mems_1])

        inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels}

        _, mems_2 = model(inputs)

        result = {
            "mems_1": [mem.numpy() for mem in mems_1],
            "lm_logits_1": lm_logits_1.numpy(),
            "mems_2": [mem.numpy() for mem in mems_2],
            "lm_logits_2": lm_logits_2.numpy(),
        }

        self.parent.assertListEqual(
            list(result["lm_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_1"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )
        self.parent.assertListEqual(
            list(result["lm_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_2"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
        inputs_dict = {"input_ids": input_ids_1}
        return config, inputs_dict
@require_tf
class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
...
...
@@ -43,155 +172,8 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
    test_torchscript = False
    test_resize_embeddings = False
    class TFTransfoXLModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            mem_len=30,
            clamp_len=15,
            is_training=True,
            use_labels=True,
            vocab_size=99,
            cutoffs=[10, 50, 80],
            hidden_size=32,
            d_embed=32,
            num_attention_heads=4,
            d_head=8,
            d_inner=128,
            div_val=2,
            num_hidden_layers=5,
            scope=None,
            seed=1,
            eos_token_id=0,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.mem_len = mem_len
            self.key_length = seq_length + mem_len
            self.clamp_len = clamp_len
            self.is_training = is_training
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.cutoffs = cutoffs
            self.hidden_size = hidden_size
            self.d_embed = d_embed
            self.num_attention_heads = num_attention_heads
            self.d_head = d_head
            self.d_inner = d_inner
            self.div_val = div_val
            self.num_hidden_layers = num_hidden_layers
            self.scope = scope
            self.seed = seed
            self.eos_token_id = eos_token_id

        def prepare_config_and_inputs(self):
            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            lm_labels = None
            if self.use_labels:
                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            config = TransfoXLConfig(
                vocab_size=self.vocab_size,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                cutoffs=self.cutoffs,
                d_model=self.hidden_size,
                d_embed=self.d_embed,
                n_head=self.num_attention_heads,
                d_head=self.d_head,
                d_inner=self.d_inner,
                div_val=self.div_val,
                n_layer=self.num_hidden_layers,
                eos_token_id=self.eos_token_id,
            )

            return (config, input_ids_1, input_ids_2, lm_labels)

        def set_seed(self):
            random.seed(self.seed)
            tf.random.set_seed(self.seed)

        def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
            model = TFTransfoXLModel(config)

            hidden_states_1, mems_1 = model(input_ids_1)

            inputs = {"input_ids": input_ids_2, "mems": mems_1}

            hidden_states_2, mems_2 = model(inputs)

            result = {
                "hidden_states_1": hidden_states_1.numpy(),
                "mems_1": [mem.numpy() for mem in mems_1],
                "hidden_states_2": hidden_states_2.numpy(),
                "mems_2": [mem.numpy() for mem in mems_2],
            }

            self.parent.assertListEqual(
                list(result["hidden_states_1"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(
                list(result["hidden_states_2"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_1"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
            )
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_2"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
            )

        def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
            model = TFTransfoXLLMHeadModel(config)

            lm_logits_1, mems_1 = model(input_ids_1)

            inputs = {"input_ids": input_ids_1, "labels": lm_labels}
            _, mems_1 = model(inputs)

            lm_logits_2, mems_2 = model([input_ids_2, mems_1])

            inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels}

            _, mems_2 = model(inputs)

            result = {
                "mems_1": [mem.numpy() for mem in mems_1],
                "lm_logits_1": lm_logits_1.numpy(),
                "mems_2": [mem.numpy() for mem in mems_2],
                "lm_logits_2": lm_logits_2.numpy(),
            }

            self.parent.assertListEqual(
                list(result["lm_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_1"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
            )
            self.parent.assertListEqual(
                list(result["lm_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_2"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
            )

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
            inputs_dict = {"input_ids": input_ids_1}
            return config, inputs_dict
    def setUp(self):
        self.model_tester = TFTransfoXLModelTest.TFTransfoXLModelTester(self)
        self.model_tester = TFTransfoXLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)

    def test_config(self):
...
...
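Aside (illustration only): the mems assertions in TFTransfoXLModelTester expect one memory tensor per layer with shape [mem_len, batch_size, hidden_size]. Using the defaults defined above (mem_len=30, batch_size=13, hidden_size=32, 5 layers), the expected layout can be checked in isolation:

    import tensorflow as tf

    mem_len, batch_size, hidden_size, n_layer = 30, 13, 32, 5

    # Dummy memories laid out the way the assertions expect: one tensor per layer.
    mems = [tf.zeros((mem_len, batch_size, hidden_size)) for _ in range(n_layer)]

    assert [list(mem.shape) for mem in mems] == [[mem_len, batch_size, hidden_size]] * n_layer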