Unverified commit c3d9ac76, authored by Lysandre Debut, committed by GitHub

Expose get_config() on ModelTesters (#12812)

* Expose get_config() on ModelTesters

* Typo
parent cabcc751
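The same pattern is applied to every ModelTester in the test suite: the config construction is pulled out of prepare_config_and_inputs() into a standalone get_config() method, so a test can obtain the tiny test configuration without also building the input tensors. Below is a minimal sketch of the resulting shape, using a simplified stand-in tester; the real testers take many more constructor arguments and use the ids_tensor helper from test_modeling_common, while torch.randint is used here only to keep the sketch self-contained.

import torch
from transformers import BertConfig


class TinyBertModelTesterSketch:
    """Illustrative stand-in for the refactored testers, not code taken from the diff."""

    def __init__(self, batch_size=13, seq_length=7, vocab_size=99, hidden_size=32):
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def prepare_config_and_inputs(self):
        # Inputs are built exactly as before; the config now comes from get_config().
        input_ids = torch.randint(0, self.vocab_size, (self.batch_size, self.seq_length))
        config = self.get_config()
        return config, input_ids

    def get_config(self):
        # Returns a tiny configuration by default (wording from the BertModelTester docstring below).
        return BertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=2,
            num_attention_heads=4,
            intermediate_size=37,
        )

The diff below shows the same refactoring applied file by file.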
@@ -22,6 +22,7 @@ from tests.test_modeling_common import floats_tensor
from transformers import is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
+from transformers import {{cookiecutter.camelcase_modelname}}Config
from .test_configuration_common import ConfigTester
from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -30,7 +31,6 @@ if is_torch_available():
import torch
from transformers import (
-{{cookiecutter.camelcase_modelname}}Config,
{{cookiecutter.camelcase_modelname}}ForCausalLM,
{{cookiecutter.camelcase_modelname}}ForMaskedLM,
{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
@@ -112,7 +112,12 @@ class {{cookiecutter.camelcase_modelname}}ModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
-config = {{cookiecutter.camelcase_modelname}}Config(
+config = self.get_config()
+return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+def get_config(self):
+return {{cookiecutter.camelcase_modelname}}Config(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
@@ -127,8 +132,6 @@ class {{cookiecutter.camelcase_modelname}}ModelTester:
initializer_range=self.initializer_range,
)
-return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def prepare_config_and_inputs_for_decoder(self):
(
config,
......
@@ -16,7 +16,7 @@
import unittest
-from transformers import is_torch_available
+from transformers import AlbertConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, slow, torch_device
@@ -29,7 +29,6 @@ if is_torch_available():
from transformers import (
MODEL_FOR_PRETRAINING_MAPPING,
-AlbertConfig,
AlbertForMaskedLM,
AlbertForMultipleChoice,
AlbertForPreTraining,
@@ -90,7 +89,12 @@ class AlbertModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
-config = AlbertConfig(
+config = self.get_config()
+return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+def get_config(self):
+return AlbertConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
@@ -105,8 +109,6 @@ class AlbertModelTester:
num_hidden_groups=self.num_hidden_groups,
)
-return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
......
@@ -21,7 +21,7 @@ import unittest
import timeout_decorator # noqa
-from transformers import is_torch_available
+from transformers import BartConfig, is_torch_available
from transformers.file_utils import cached_property
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -35,7 +35,6 @@ if is_torch_available():
from transformers import (
AutoModelForSequenceClassification,
-BartConfig,
BartForCausalLM,
BartForConditionalGeneration,
BartForQuestionAnswering,
@@ -78,7 +77,6 @@ def prepare_bart_inputs_dict(
}
-@require_torch
class BartModelTester:
def __init__(
self,
@@ -127,7 +125,12 @@ class BartModelTester:
decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-config = BartConfig(
+config = self.get_config()
+inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids)
+return config, inputs_dict
+def get_config(self):
+return BartConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers,
@@ -143,8 +146,6 @@ class BartModelTester:
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
)
-inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids)
-return config, inputs_dict
def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
......
@@ -16,7 +16,7 @@
import unittest
-from transformers import is_torch_available
+from transformers import BertConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, slow, torch_device
@@ -30,7 +30,6 @@ if is_torch_available():
from transformers import (
MODEL_FOR_PRETRAINING_MAPPING,
-BertConfig,
BertForMaskedLM,
BertForMultipleChoice,
BertForNextSentencePrediction,
@@ -112,7 +111,15 @@ class BertModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
-config = BertConfig(
+config = self.get_config()
+return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+def get_config(self):
+"""
+Returns a tiny configuration by default.
+"""
+return BertConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
@@ -127,8 +134,6 @@ class BertModelTester:
initializer_range=self.initializer_range,
)
-return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def prepare_config_and_inputs_for_decoder(self):
(
config,
......
@@ -16,7 +16,7 @@
import unittest
-from transformers import is_torch_available
+from transformers import BertGenerationConfig, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
@@ -27,7 +27,7 @@ from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, r
if is_torch_available():
import torch
-from transformers import BertGenerationConfig, BertGenerationDecoder, BertGenerationEncoder
+from transformers import BertGenerationDecoder, BertGenerationEncoder
class BertGenerationEncoderTester:
@@ -79,7 +79,12 @@ class BertGenerationEncoderTester:
if self.use_labels:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-config = BertGenerationConfig(
+config = self.get_config()
+return config, input_ids, input_mask, token_labels
+def get_config(self):
+return BertGenerationConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
@@ -93,8 +98,6 @@ class BertGenerationEncoderTester:
initializer_range=self.initializer_range,
)
-return config, input_ids, input_mask, token_labels
def prepare_config_and_inputs_for_decoder(self):
(
config,
......
@@ -18,7 +18,7 @@
import unittest
from tests.test_modeling_common import floats_tensor
-from transformers import is_torch_available
+from transformers import BigBirdConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer
from transformers.testing_utils import require_torch, slow, torch_device
@@ -32,7 +32,6 @@ if is_torch_available():
from transformers import (
MODEL_FOR_PRETRAINING_MAPPING,
-BigBirdConfig,
BigBirdForCausalLM,
BigBirdForMaskedLM,
BigBirdForMultipleChoice,
@@ -126,7 +125,12 @@ class BigBirdModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
-config = BigBirdConfig(
+config = self.get_config()
+return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+def get_config(self):
+return BigBirdConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
@@ -147,8 +151,6 @@ class BigBirdModelTester:
position_embedding_type=self.position_embedding_type,
)
-return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def prepare_config_and_inputs_for_decoder(self):
(
config,
......
@@ -19,7 +19,7 @@ import copy
import tempfile
import unittest
-from transformers import is_torch_available
+from transformers import BigBirdPegasusConfig, is_torch_available
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
@@ -31,7 +31,6 @@ if is_torch_available():
import torch
from transformers import (
-BigBirdPegasusConfig,
BigBirdPegasusForCausalLM,
BigBirdPegasusForConditionalGeneration,
BigBirdPegasusForQuestionAnswering,
@@ -69,7 +68,6 @@ def prepare_bigbird_pegasus_inputs_dict(
return input_dict
-@require_torch
class BigBirdPegasusModelTester:
def __init__(
self,
@@ -129,7 +127,12 @@ class BigBirdPegasusModelTester:
decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-config = BigBirdPegasusConfig(
+config = self.get_config()
+inputs_dict = prepare_bigbird_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
+return config, inputs_dict
+def get_config(self):
+return BigBirdPegasusConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers,
@@ -150,8 +153,6 @@ class BigBirdPegasusModelTester:
num_random_blocks=self.num_random_blocks,
scale_embedding=self.scale_embedding,
)
-inputs_dict = prepare_bigbird_pegasus_inputs_dict(config, input_ids, decoder_input_ids)
-return config, inputs_dict
def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
......
@@ -17,7 +17,7 @@
import tempfile
import unittest
-from transformers import is_torch_available
+from transformers import BlenderbotConfig, is_torch_available
from transformers.file_utils import cached_property
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -29,7 +29,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor
if is_torch_available():
import torch
-from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration, BlenderbotModel, BlenderbotTokenizer
+from transformers import BlenderbotForConditionalGeneration, BlenderbotModel, BlenderbotTokenizer
from transformers.models.blenderbot.modeling_blenderbot import (
BlenderbotDecoder,
BlenderbotEncoder,
@@ -68,7 +68,6 @@ def prepare_blenderbot_inputs_dict(
}
-@require_torch
class BlenderbotModelTester:
def __init__(
self,
@@ -109,7 +108,6 @@ class BlenderbotModelTester:
self.bos_token_id = bos_token_id
def prepare_config_and_inputs(self):
-input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
3,
)
@@ -117,7 +115,12 @@ class BlenderbotModelTester:
decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-config = BlenderbotConfig(
+config = self.get_config()
+inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids)
+return config, inputs_dict
+def get_config(self):
+return BlenderbotConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers,
@@ -133,8 +136,6 @@ class BlenderbotModelTester:
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
)
-inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids)
-return config, inputs_dict
def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
......
@@ -17,7 +17,7 @@
import tempfile
import unittest
-from transformers import is_torch_available
+from transformers import BlenderbotSmallConfig, is_torch_available
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch, slow, torch_device
@@ -29,12 +29,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor
if is_torch_available():
import torch
-from transformers import (
-BlenderbotSmallConfig,
-BlenderbotSmallForConditionalGeneration,
-BlenderbotSmallModel,
-BlenderbotSmallTokenizer,
-)
+from transformers import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel, BlenderbotSmallTokenizer
from transformers.models.blenderbot_small.modeling_blenderbot_small import (
BlenderbotSmallDecoder,
BlenderbotSmallEncoder,
@@ -73,7 +68,6 @@ def prepare_blenderbot_small_inputs_dict(
}
-@require_torch
class BlenderbotSmallModelTester:
def __init__(
self,
@@ -114,7 +108,6 @@ class BlenderbotSmallModelTester:
self.bos_token_id = bos_token_id
def prepare_config_and_inputs(self):
-input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
3,
)
@@ -122,7 +115,12 @@ class BlenderbotSmallModelTester:
decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-config = BlenderbotSmallConfig(
+config = self.get_config()
+inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids)
+return config, inputs_dict
+def get_config(self):
+return BlenderbotSmallConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers,
@@ -138,8 +136,6 @@ class BlenderbotSmallModelTester:
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
)
-inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids)
-return config, inputs_dict
def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
......
@@ -18,7 +18,7 @@
import unittest
from typing import List, Tuple
-from transformers import is_torch_available
+from transformers import CanineConfig, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
@@ -29,7 +29,6 @@ if is_torch_available():
import torch
from transformers import (
-CanineConfig,
CanineForMultipleChoice,
CanineForQuestionAnswering,
CanineForSequenceClassification,
@@ -106,7 +105,12 @@ class CanineModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
-config = CanineConfig(
+config = self.get_config()
+return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+def get_config(self):
+return CanineConfig(
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
@@ -120,8 +124,6 @@ class CanineModelTester:
initializer_range=self.initializer_range,
)
-return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
......
@@ -21,6 +21,7 @@ import tempfile
import unittest
import requests
+from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
from transformers.file_utils import is_torch_available, is_vision_available
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
@@ -32,7 +33,7 @@ if is_torch_available():
import torch
from torch import nn
-from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPVisionConfig, CLIPVisionModel
+from transformers import CLIPModel, CLIPTextModel, CLIPVisionModel
from transformers.models.clip.modeling_clip import CLIP_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -77,7 +78,12 @@ class CLIPVisionModelTester:
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-config = CLIPVisionConfig(
+config = self.get_config()
+return config, pixel_values
+def get_config(self):
+return CLIPVisionConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
@@ -90,8 +96,6 @@ class CLIPVisionModelTester:
initializer_range=self.initializer_range,
)
-return config, pixel_values
def create_and_check_model(self, config, pixel_values):
model = CLIPVisionModel(config=config)
model.to(torch_device)
@@ -323,7 +327,12 @@ class CLIPTextModelTester:
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
-config = CLIPTextConfig(
+config = self.get_config()
+return config, input_ids, input_mask
+def get_config(self):
+return CLIPTextConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
@@ -335,8 +344,6 @@ class CLIPTextModelTester:
initializer_range=self.initializer_range,
)
-return config, input_ids, input_mask
def create_and_check_model(self, config, input_ids, input_mask):
model = CLIPTextModel(config=config)
model.to(torch_device)
@@ -409,10 +416,15 @@ class CLIPModelTester:
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
-config = CLIPConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64)
+config = self.get_config()
return config, input_ids, attention_mask, pixel_values
+def get_config(self):
+return CLIPConfig.from_text_vision_configs(
+self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
+)
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
model = CLIPModel(config).to(torch_device).eval()
result = model(input_ids, pixel_values, attention_mask)
......
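The CLIP change above also shows how the pattern composes for multi-modal models: the top-level CLIPModelTester.get_config() delegates to the text and vision sub-testers' get_config() methods via CLIPConfig.from_text_vision_configs. A minimal sketch of that composition, assuming two already-constructed sub-testers (the real tester builds them itself in __init__):

from transformers import CLIPConfig


class CLIPModelTesterSketch:
    """Illustrative stand-in; the sub-testers are injected here only for brevity."""

    def __init__(self, text_model_tester, vision_model_tester):
        self.text_model_tester = text_model_tester
        self.vision_model_tester = vision_model_tester

    def get_config(self):
        # Combine the sub-testers' tiny configs; projection_dim=64 matches the value in the diff.
        return CLIPConfig.from_text_vision_configs(
            self.text_model_tester.get_config(),
            self.vision_model_tester.get_config(),
            projection_dim=64,
        )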
@@ -18,7 +18,7 @@
import unittest
from tests.test_modeling_common import floats_tensor
-from transformers import is_torch_available
+from transformers import ConvBertConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, slow, torch_device
@@ -31,7 +31,6 @@ if is_torch_available():
from transformers import (
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-ConvBertConfig,
ConvBertForMaskedLM,
ConvBertForMultipleChoice,
ConvBertForQuestionAnswering,
@@ -110,7 +109,12 @@ class ConvBertModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
-config = ConvBertConfig(
+config = self.get_config()
+return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+def get_config(self):
+return ConvBertConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
@@ -125,8 +129,6 @@ class ConvBertModelTester:
initializer_range=self.initializer_range,
)
-return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def prepare_config_and_inputs_for_decoder(self):
(
config,
......
@@ -15,7 +15,7 @@
import unittest
-from transformers import is_torch_available
+from transformers import CTRLConfig, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
@@ -28,7 +28,6 @@ if is_torch_available():
from transformers import (
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
-CTRLConfig,
CTRLForSequenceClassification,
CTRLLMHeadModel,
CTRLModel,
@@ -88,21 +87,7 @@ class CTRLModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
-config = CTRLConfig(
-vocab_size=self.vocab_size,
-n_embd=self.hidden_size,
-n_layer=self.num_hidden_layers,
-n_head=self.num_attention_heads,
-# intermediate_size=self.intermediate_size,
-# hidden_act=self.hidden_act,
-# hidden_dropout_prob=self.hidden_dropout_prob,
-# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-n_positions=self.max_position_embeddings,
-n_ctx=self.max_position_embeddings,
-# type_vocab_size=self.type_vocab_size,
-# initializer_range=self.initializer_range,
-pad_token_id=self.pad_token_id,
-)
+config = self.get_config()
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -118,6 +103,23 @@ class CTRLModelTester:
choice_labels,
)
+def get_config(self):
+return CTRLConfig(
+vocab_size=self.vocab_size,
+n_embd=self.hidden_size,
+n_layer=self.num_hidden_layers,
+n_head=self.num_attention_heads,
+# intermediate_size=self.intermediate_size,
+# hidden_act=self.hidden_act,
+# hidden_dropout_prob=self.hidden_dropout_prob,
+# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+n_positions=self.max_position_embeddings,
+n_ctx=self.max_position_embeddings,
+# type_vocab_size=self.type_vocab_size,
+# initializer_range=self.initializer_range,
+pad_token_id=self.pad_token_id,
+)
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = CTRLModel(config=config)
model.to(torch_device)
......
@@ -12,10 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
-from transformers import is_torch_available
+from transformers import DebertaConfig, is_torch_available
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
@@ -26,7 +25,6 @@ if is_torch_available():
import torch
from transformers import (
-DebertaConfig,
DebertaForMaskedLM,
DebertaForQuestionAnswering,
DebertaForSequenceClassification,
@@ -36,6 +34,179 @@ if is_torch_available():
from transformers.models.deberta.modeling_deberta import DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
class DebertaModelTester(object):
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
relative_attention=False,
position_biased_input=True,
pos_att_type="None",
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.relative_attention = relative_attention
self.position_biased_input = position_biased_input
self.pos_att_type = pos_att_type
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = self.get_config()
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
return DebertaConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
relative_attention=self.relative_attention,
position_biased_input=self.position_biased_input,
pos_att_type=self.pos_att_type,
)
def check_loss_output(self, result):
self.parent.assertListEqual(list(result.loss.size()), [])
def create_and_check_deberta_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaModel(config=config)
model.to(torch_device)
model.eval()
sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
sequence_output = model(input_ids)[0]
self.parent.assertListEqual(list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_deberta_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_deberta_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = DebertaForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
def create_and_check_deberta_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = DebertaForTokenClassification(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_deberta_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
@@ -56,179 +227,8 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
test_head_masking = False
is_encoder_decoder = False
class DebertaModelTester(object):
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
relative_attention=False,
position_biased_input=True,
pos_att_type="None",
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.relative_attention = relative_attention
self.position_biased_input = position_biased_input
self.pos_att_type = pos_att_type
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = DebertaConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
relative_attention=self.relative_attention,
position_biased_input=self.position_biased_input,
pos_att_type=self.pos_att_type,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def check_loss_output(self, result):
self.parent.assertListEqual(list(result.loss.size()), [])
def create_and_check_deberta_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaModel(config=config)
model.to(torch_device)
model.eval()
sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
sequence_output = model(input_ids)[0]
self.parent.assertListEqual(
list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size]
)
def create_and_check_deberta_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_deberta_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = DebertaForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
def create_and_check_deberta_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = DebertaForTokenClassification(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_deberta_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
def setUp(self):
-self.model_tester = DebertaModelTest.DebertaModelTester(self)
+self.model_tester = DebertaModelTester(self)
self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37)
def test_config(self):
......
@@ -12,10 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
-from transformers import is_torch_available
+from transformers import DebertaV2Config, is_torch_available
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
@@ -26,7 +25,6 @@ if is_torch_available():
import torch
from transformers import (
-DebertaV2Config,
DebertaV2ForMaskedLM,
DebertaV2ForQuestionAnswering,
DebertaV2ForSequenceClassification,
@@ -36,6 +34,179 @@ if is_torch_available():
from transformers.models.deberta_v2.modeling_deberta_v2 import DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST
class DebertaV2ModelTester(object):
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
relative_attention=False,
position_biased_input=True,
pos_att_type="None",
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.relative_attention = relative_attention
self.position_biased_input = position_biased_input
self.pos_att_type = pos_att_type
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = self.get_config()
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
return DebertaV2Config(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
relative_attention=self.relative_attention,
position_biased_input=self.position_biased_input,
pos_att_type=self.pos_att_type,
)
def check_loss_output(self, result):
self.parent.assertListEqual(list(result.loss.size()), [])
def create_and_check_deberta_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaV2Model(config=config)
model.to(torch_device)
model.eval()
sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
sequence_output = model(input_ids)[0]
self.parent.assertListEqual(list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_deberta_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaV2ForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_deberta_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = DebertaV2ForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
def create_and_check_deberta_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = DebertaV2ForTokenClassification(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_deberta_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaV2ForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class DebertaV2ModelTest(ModelTesterMixin, unittest.TestCase):
@@ -56,179 +227,8 @@ class DebertaV2ModelTest(ModelTesterMixin, unittest.TestCase):
test_head_masking = False
is_encoder_decoder = False
class DebertaV2ModelTester(object):
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
relative_attention=False,
position_biased_input=True,
pos_att_type="None",
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.relative_attention = relative_attention
self.position_biased_input = position_biased_input
self.pos_att_type = pos_att_type
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = DebertaV2Config(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
relative_attention=self.relative_attention,
position_biased_input=self.position_biased_input,
pos_att_type=self.pos_att_type,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def check_loss_output(self, result):
self.parent.assertListEqual(list(result.loss.size()), [])
def create_and_check_deberta_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaV2Model(config=config)
model.to(torch_device)
model.eval()
sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
sequence_output = model(input_ids)[0]
self.parent.assertListEqual(
list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size]
)
def create_and_check_deberta_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaV2ForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_deberta_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = DebertaV2ForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
def create_and_check_deberta_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = DebertaV2ForTokenClassification(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_deberta_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = DebertaV2ForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
def setUp(self): def setUp(self):
self.model_tester = DebertaV2ModelTest.DebertaV2ModelTester(self) self.model_tester = DebertaV2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37) self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37)
def test_config(self): def test_config(self):
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
import inspect import inspect
import unittest import unittest
from transformers import DeiTConfig
from transformers.file_utils import cached_property, is_torch_available, is_vision_available from transformers.file_utils import cached_property, is_torch_available, is_vision_available
from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.testing_utils import require_torch, require_vision, slow, torch_device
...@@ -31,7 +32,6 @@ if is_torch_available(): ...@@ -31,7 +32,6 @@ if is_torch_available():
from transformers import ( from transformers import (
MODEL_MAPPING, MODEL_MAPPING,
DeiTConfig,
DeiTForImageClassification, DeiTForImageClassification,
DeiTForImageClassificationWithTeacher, DeiTForImageClassificationWithTeacher,
DeiTModel, DeiTModel,
...@@ -92,7 +92,12 @@ class DeiTModelTester: ...@@ -92,7 +92,12 @@ class DeiTModelTester:
if self.use_labels: if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size) labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
config = DeiTConfig( config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return DeiTConfig(
image_size=self.image_size, image_size=self.image_size,
patch_size=self.patch_size, patch_size=self.patch_size,
num_channels=self.num_channels, num_channels=self.num_channels,
...@@ -107,8 +112,6 @@ class DeiTModelTester: ...@@ -107,8 +112,6 @@ class DeiTModelTester:
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
) )
return config, pixel_values, labels
def create_and_check_model(self, config, pixel_values, labels): def create_and_check_model(self, config, pixel_values, labels):
model = DeiTModel(config=config) model = DeiTModel(config=config)
model.to(torch_device) model.to(torch_device)
......
...@@ -19,7 +19,7 @@ import inspect ...@@ -19,7 +19,7 @@ import inspect
import math import math
import unittest import unittest
from transformers import is_timm_available, is_vision_available from transformers import DetrConfig, is_timm_available, is_vision_available
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import require_timm, require_vision, slow, torch_device from transformers.testing_utils import require_timm, require_vision, slow, torch_device
...@@ -31,7 +31,7 @@ from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_te ...@@ -31,7 +31,7 @@ from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_te
if is_timm_available(): if is_timm_available():
import torch import torch
from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrModel from transformers import DetrForObjectDetection, DetrForSegmentation, DetrModel
if is_vision_available(): if is_vision_available():
...@@ -40,7 +40,6 @@ if is_vision_available(): ...@@ -40,7 +40,6 @@ if is_vision_available():
from transformers import DetrFeatureExtractor from transformers import DetrFeatureExtractor
@require_timm
class DetrModelTester: class DetrModelTester:
def __init__( def __init__(
self, self,
...@@ -102,7 +101,11 @@ class DetrModelTester: ...@@ -102,7 +101,11 @@ class DetrModelTester:
target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device)
labels.append(target) labels.append(target)
config = DetrConfig( config = self.get_config()
return config, pixel_values, pixel_mask, labels
def get_config(self):
return DetrConfig(
d_model=self.hidden_size, d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers, encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers,
...@@ -115,7 +118,6 @@ class DetrModelTester: ...@@ -115,7 +118,6 @@ class DetrModelTester:
num_queries=self.num_queries, num_queries=self.num_queries,
num_labels=self.num_labels, num_labels=self.num_labels,
) )
return config, pixel_values, pixel_mask, labels
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs()
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
import unittest import unittest
from transformers import is_torch_available from transformers import DistilBertConfig, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester from .test_configuration_common import ConfigTester
...@@ -28,7 +28,6 @@ if is_torch_available(): ...@@ -28,7 +28,6 @@ if is_torch_available():
from transformers import ( from transformers import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
DistilBertConfig,
DistilBertForMaskedLM, DistilBertForMaskedLM,
DistilBertForMultipleChoice, DistilBertForMultipleChoice,
DistilBertForQuestionAnswering, DistilBertForQuestionAnswering,
...@@ -37,160 +36,162 @@ if is_torch_available(): ...@@ -37,160 +36,162 @@ if is_torch_available():
DistilBertModel, DistilBertModel,
) )
class DistilBertModelTester(object):
def __init__( class DistilBertModelTester(object):
self, def __init__(
parent, self,
batch_size=13, parent,
seq_length=7, batch_size=13,
is_training=True, seq_length=7,
use_input_mask=True, is_training=True,
use_token_type_ids=False, use_input_mask=True,
use_labels=True, use_token_type_ids=False,
vocab_size=99, use_labels=True,
hidden_size=32, vocab_size=99,
num_hidden_layers=5, hidden_size=32,
num_attention_heads=4, num_hidden_layers=5,
intermediate_size=37, num_attention_heads=4,
hidden_act="gelu", intermediate_size=37,
hidden_dropout_prob=0.1, hidden_act="gelu",
attention_probs_dropout_prob=0.1, hidden_dropout_prob=0.1,
max_position_embeddings=512, attention_probs_dropout_prob=0.1,
type_vocab_size=16, max_position_embeddings=512,
type_sequence_label_size=2, type_vocab_size=16,
initializer_range=0.02, type_sequence_label_size=2,
num_labels=3, initializer_range=0.02,
num_choices=4, num_labels=3,
scope=None, num_choices=4,
): scope=None,
self.parent = parent ):
self.batch_size = batch_size self.parent = parent
self.seq_length = seq_length self.batch_size = batch_size
self.is_training = is_training self.seq_length = seq_length
self.use_input_mask = use_input_mask self.is_training = is_training
self.use_token_type_ids = use_token_type_ids self.use_input_mask = use_input_mask
self.use_labels = use_labels self.use_token_type_ids = use_token_type_ids
self.vocab_size = vocab_size self.use_labels = use_labels
self.hidden_size = hidden_size self.vocab_size = vocab_size
self.num_hidden_layers = num_hidden_layers self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob self.hidden_act = hidden_act
self.attention_probs_dropout_prob = attention_probs_dropout_prob self.hidden_dropout_prob = hidden_dropout_prob
self.max_position_embeddings = max_position_embeddings self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.type_vocab_size = type_vocab_size self.max_position_embeddings = max_position_embeddings
self.type_sequence_label_size = type_sequence_label_size self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range self.type_sequence_label_size = type_sequence_label_size
self.num_labels = num_labels self.initializer_range = initializer_range
self.num_choices = num_choices self.num_labels = num_labels
self.scope = scope self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask: input_mask = None
input_mask = random_attention_mask([self.batch_size, self.seq_length]) if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
sequence_labels = None
token_labels = None sequence_labels = None
choice_labels = None token_labels = None
if self.use_labels: choice_labels = None
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) if self.use_labels:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
choice_labels = ids_tensor([self.batch_size], self.num_choices) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = DistilBertConfig(
vocab_size=self.vocab_size, config = self.get_config()
dim=self.hidden_size,
n_layers=self.num_hidden_layers, return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
n_heads=self.num_attention_heads,
hidden_dim=self.intermediate_size, def get_config(self):
hidden_act=self.hidden_act, return DistilBertConfig(
dropout=self.hidden_dropout_prob, vocab_size=self.vocab_size,
attention_dropout=self.attention_probs_dropout_prob, dim=self.hidden_size,
max_position_embeddings=self.max_position_embeddings, n_layers=self.num_hidden_layers,
initializer_range=self.initializer_range, n_heads=self.num_attention_heads,
) hidden_dim=self.intermediate_size,
hidden_act=self.hidden_act,
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
def create_and_check_distilbert_model( max_position_embeddings=self.max_position_embeddings,
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels initializer_range=self.initializer_range,
): )
model = DistilBertModel(config=config)
model.to(torch_device) def create_and_check_distilbert_model(
model.eval() self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
result = model(input_ids, input_mask) ):
result = model(input_ids) model = DistilBertModel(config=config)
self.parent.assertEqual( model.to(torch_device)
result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size) model.eval()
) result = model(input_ids, input_mask)
result = model(input_ids)
def create_and_check_distilbert_for_masked_lm( self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
): def create_and_check_distilbert_for_masked_lm(
model = DistilBertForMaskedLM(config=config) self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
model.to(torch_device) ):
model.eval() model = DistilBertForMaskedLM(config=config)
result = model(input_ids, attention_mask=input_mask, labels=token_labels) model.to(torch_device)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) model.eval()
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
def create_and_check_distilbert_for_question_answering( self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
): def create_and_check_distilbert_for_question_answering(
model = DistilBertForQuestionAnswering(config=config) self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
model.to(torch_device) ):
model.eval() model = DistilBertForQuestionAnswering(config=config)
result = model( model.to(torch_device)
input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels model.eval()
) result = model(
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) )
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_distilbert_for_sequence_classification( self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
): def create_and_check_distilbert_for_sequence_classification(
config.num_labels = self.num_labels self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
model = DistilBertForSequenceClassification(config) ):
model.to(torch_device) config.num_labels = self.num_labels
model.eval() model = DistilBertForSequenceClassification(config)
result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) model.to(torch_device)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) model.eval()
result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
def create_and_check_distilbert_for_token_classification( self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
): def create_and_check_distilbert_for_token_classification(
config.num_labels = self.num_labels self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
model = DistilBertForTokenClassification(config=config) ):
model.to(torch_device) config.num_labels = self.num_labels
model.eval() model = DistilBertForTokenClassification(config=config)
model.to(torch_device)
result = model(input_ids, attention_mask=input_mask, labels=token_labels) model.eval()
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
def create_and_check_distilbert_for_multiple_choice( self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
): def create_and_check_distilbert_for_multiple_choice(
config.num_choices = self.num_choices self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
model = DistilBertForMultipleChoice(config=config) ):
model.to(torch_device) config.num_choices = self.num_choices
model.eval() model = DistilBertForMultipleChoice(config=config)
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() model.to(torch_device)
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() model.eval()
result = model( multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_inputs_ids, multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
attention_mask=multiple_choice_input_mask, result = model(
labels=choice_labels, multiple_choice_inputs_ids,
) attention_mask=multiple_choice_input_mask,
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) labels=choice_labels,
)
def prepare_config_and_inputs_for_common(self): self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs def prepare_config_and_inputs_for_common(self):
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} config_and_inputs = self.prepare_config_and_inputs()
return config, inputs_dict (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch @require_torch
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
import unittest import unittest
from transformers import is_torch_available from transformers import DPRConfig, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester from .test_configuration_common import ConfigTester
...@@ -26,7 +26,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention ...@@ -26,7 +26,7 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention
if is_torch_available(): if is_torch_available():
import torch import torch
from transformers import DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader, DPRReaderTokenizer from transformers import DPRContextEncoder, DPRQuestionEncoder, DPRReader, DPRReaderTokenizer
from transformers.models.dpr.modeling_dpr import ( from transformers.models.dpr.modeling_dpr import (
DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
...@@ -104,7 +104,12 @@ class DPRModelTester: ...@@ -104,7 +104,12 @@ class DPRModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices) choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = DPRConfig( config = self.get_config()
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
return DPRConfig(
projection_dim=self.projection_dim, projection_dim=self.projection_dim,
vocab_size=self.vocab_size, vocab_size=self.vocab_size,
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
...@@ -119,8 +124,6 @@ class DPRModelTester: ...@@ -119,8 +124,6 @@ class DPRModelTester:
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
) )
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_context_encoder( def create_and_check_context_encoder(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
import unittest import unittest
from transformers import is_torch_available from transformers import ElectraConfig, is_torch_available
from transformers.models.auto import get_values from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, slow, torch_device from transformers.testing_utils import require_torch, slow, torch_device
...@@ -29,7 +29,6 @@ if is_torch_available(): ...@@ -29,7 +29,6 @@ if is_torch_available():
from transformers import ( from transformers import (
MODEL_FOR_PRETRAINING_MAPPING, MODEL_FOR_PRETRAINING_MAPPING,
ElectraConfig,
ElectraForMaskedLM, ElectraForMaskedLM,
ElectraForMultipleChoice, ElectraForMultipleChoice,
ElectraForPreTraining, ElectraForPreTraining,
...@@ -89,7 +88,21 @@ class ElectraModelTester: ...@@ -89,7 +88,21 @@ class ElectraModelTester:
choice_labels = ids_tensor([self.batch_size], self.num_choices) choice_labels = ids_tensor([self.batch_size], self.num_choices)
fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1) fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1)
config = ElectraConfig( config = self.get_config()
return (
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
fake_token_labels,
)
def get_config(self):
return ElectraConfig(
vocab_size=self.vocab_size, vocab_size=self.vocab_size,
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers, num_hidden_layers=self.num_hidden_layers,
...@@ -104,17 +117,6 @@ class ElectraModelTester: ...@@ -104,17 +117,6 @@ class ElectraModelTester:
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
) )
return (
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
fake_token_labels,
)
def create_and_check_electra_model( def create_and_check_electra_model(
self, self,
config, config,
......
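Each of the diffs above applies the same refactor: the config construction is lifted out of prepare_config_and_inputs() into a standalone get_config(), so a tiny config can be obtained without also building input tensors. As a rough illustration of the intent (this usage sketch is not part of the commit, and the test class name is hypothetical), a downstream test could now call get_config() directly on a ModelTester:

# Illustrative only -- this test is not part of the commit; it assumes the
# DistilBert tester shown above, where get_config() returns a tiny DistilBertConfig.
import unittest

from transformers import DistilBertConfig
from transformers.testing_utils import require_torch

from .test_modeling_distilbert import DistilBertModelTester


@require_torch
class TinyConfigUsageTest(unittest.TestCase):  # hypothetical test class
    def test_get_config_without_inputs(self):
        tester = DistilBertModelTester(self)
        config = tester.get_config()  # no input tensors are created here
        self.assertIsInstance(config, DistilBertConfig)
        self.assertEqual(config.dim, tester.hidden_size)  # dim is DistilBERT's hidden size

Before this change, the only way to obtain that config was to call prepare_config_and_inputs() and discard everything but its first element.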