Merge branch 'master' of github.com:huggingface/transformers

269c9638 · Sylvain Gugger · d31c7b10 · c2e0fd52 · 269c9638 · 269c9638
Commit 269c9638 authored Apr 08, 2021 by Sylvain Gugger
20 changed files
--- a/src/transformers/utils/versions.py
+++ b/src/transformers/utils/versions.py
@@ -60,6 +60,12 @@ def require_version(requirement: str, hint: Optional[str] = None) -> None:
    Args:
        requirement (:obj:`str`): pip style definition, e.g.,  "tokenizers==0.9.4", "tqdm>=4.27", "numpy"
        hint (:obj:`str`, `optional`): what suggestion to print in case of requirements not being met
+
+    Example::
+
+       require_version("pandas>1.1.2")
+       require_version("numpy>1.18.5", "this is important to have for whatever reason")
+
    """

    hint = f"\n{hint}" if hint is not None else ""

--- a/examples/tests/deepspeed/ds_config_zero2.json
+++ b/examples/tests/deepspeed/ds_config_zero2.json
--- a/examples/tests/deepspeed/ds_config_zero3.json
+++ b/examples/tests/deepspeed/ds_config_zero3.json
--- a/examples/tests/deepspeed/test_deepspeed.py
+++ b/examples/tests/deepspeed/test_deepspeed.py
@@ -16,16 +16,16 @@ import dataclasses
 import io
 import json
 import os
-import sys
 import unittest
 from copy import deepcopy

 from parameterized import parameterized
-from transformers import TrainingArguments
+from transformers import TrainingArguments, is_torch_available
 from transformers.file_utils import WEIGHTS_NAME
 from transformers.integrations import is_deepspeed_available
 from transformers.testing_utils import (
    CaptureLogger,
+    ExtendSysPath,
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
@@ -38,8 +38,11 @@ from transformers.trainer_utils import set_seed


 bindir = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(f"{bindir}/../../../tests")
-from test_trainer import TrainerIntegrationCommon, get_regression_trainer  # noqa
+with ExtendSysPath(f"{bindir}/.."):
+    from test_trainer import TrainerIntegrationCommon  # noqa
+
+    if is_torch_available():
+        from test_trainer import get_regression_trainer  # noqa


 set_seed(42)

--- a/examples/tests/trainer/test_trainer_ext.py
+++ b/examples/tests/trainer/test_trainer_ext.py
@@ -21,6 +21,7 @@ from unittest.mock import patch
 from transformers.file_utils import is_apex_available
 from transformers.integrations import is_fairscale_available
 from transformers.testing_utils import (
+    ExtendSysPath,
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
@@ -34,8 +35,8 @@ from transformers.trainer_utils import set_seed


 bindir = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(f"{bindir}/../../seq2seq")
-from run_translation import main  # noqa
+with ExtendSysPath(f"{bindir}/../../examples/seq2seq"):
+    from run_translation import main  # noqa


 set_seed(42)

--- a/tests/sagemaker/README.md
+++ b/tests/sagemaker/README.md
@@ -136,10 +136,7 @@ images:
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, 
      *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
 ```
-2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). 
-
-TODO: Add a screenshot of PR + Text template to make it easy to open. 
-
+2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1016) to see which information is needed.

 ## Current Tests


--- a/tests/test_data_collator.py
+++ b/tests/test_data_collator.py
@@ -146,11 +146,8 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["labels"].shape, torch.Size([2, 6]))
        self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)

-    def test_data_collator_for_language_modeling(self):
+    def _test_no_pad_and_pad(self, no_pad_features, pad_features):
        tokenizer = BertTokenizer(self.vocab_file)
-        no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
-        pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
-
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        batch = data_collator(no_pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
@@ -160,6 +157,15 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

+        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
+        batch = data_collator(no_pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        batch = data_collator(pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
        tokenizer._pad_token = None
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        with self.assertRaises(ValueError):
@@ -185,6 +191,32 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertTrue(torch.any(masked_tokens))
        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(no_pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
+        self.assertTrue(torch.any(masked_tokens))
+        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
+
+        batch = data_collator(pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
+        self.assertTrue(torch.any(masked_tokens))
+        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
+
+    def test_data_collator_for_language_modeling(self):
+        no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
+        pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
+        self._test_no_pad_and_pad(no_pad_features, pad_features)
+
+        no_pad_features = [list(range(10)), list(range(10))]
+        pad_features = [list(range(5)), list(range(10))]
+        self._test_no_pad_and_pad(no_pad_features, pad_features)
+
    def test_plm(self):
        tokenizer = BertTokenizer(self.vocab_file)
        no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
@@ -225,6 +257,14 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))

+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(features)
+
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
+
    def test_sop(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [
@@ -242,3 +282,11 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
+
+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(features)
+
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -17,6 +17,7 @@
 import unittest

 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -234,7 +235,7 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                )

--- a/tests/test_modeling_auto.py
+++ b/tests/test_modeling_auto.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
+import copy
+import tempfile
 import unittest

 from transformers import is_torch_available
@@ -46,6 +47,8 @@ if is_torch_available():
        BertForSequenceClassification,
        BertForTokenClassification,
        BertModel,
+        FunnelBaseModel,
+        FunnelModel,
        GPT2Config,
        GPT2LMHeadModel,
        RobertaForMaskedLM,
@@ -218,6 +221,21 @@ class AutoModelTest(unittest.TestCase):
        self.assertEqual(model.num_parameters(), 14410)
        self.assertEqual(model.num_parameters(only_trainable=True), 14410)

+    def test_from_pretrained_with_tuple_values(self):
+        # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel
+        model = AutoModel.from_pretrained("sgugger/funnel-random-tiny")
+        self.assertIsInstance(model, FunnelModel)
+
+        config = copy.deepcopy(model.config)
+        config.architectures = ["FunnelBaseModel"]
+        model = AutoModel.from_config(config)
+        self.assertIsInstance(model, FunnelBaseModel)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            model = AutoModel.from_pretrained(tmp_dir)
+            self.assertIsInstance(model, FunnelBaseModel)
+
    def test_parents_and_children_in_mappings(self):
        # Test that the children are placed before the parents in the mappings, as the `isinstance` check will be
        # triggered by the parents and will return the wrong configuration type when using auto models
@@ -242,6 +260,12 @@ class AutoModelTest(unittest.TestCase):
                    assert not issubclass(
                        child_config, parent_config
                    ), f"{child_config.__name__} is child of {parent_config.__name__}"
-                    assert not issubclass(
-                        child_model, parent_model
-                    ), f"{child_config.__name__} is child of {parent_config.__name__}"
+
+                    # Tuplify child_model and parent_model since some of them could be tuples.
+                    if not isinstance(child_model, (list, tuple)):
+                        child_model = (child_model,)
+                    if not isinstance(parent_model, (list, tuple)):
+                        parent_model = (parent_model,)
+
+                    for child, parent in [(a, b) for a in child_model for b in parent_model]:
+                        assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}"
--- a/tests/test_modeling_bert.py
+++ b/tests/test_modeling_bert.py
@@ -17,6 +17,7 @@
 import unittest

 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -444,7 +445,7 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                )

--- a/tests/test_modeling_big_bird.py
+++ b/tests/test_modeling_big_bird.py
@@ -19,6 +19,7 @@ import unittest

 from tests.test_modeling_common import floats_tensor
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer
 from transformers.testing_utils import require_torch, slow, torch_device

@@ -458,7 +459,7 @@ class BigBirdModelTest(ModelTesterMixin, unittest.TestCase):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                )

--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -24,6 +24,7 @@ from typing import List, Tuple

 from transformers import is_torch_available
 from transformers.file_utils import WEIGHTS_NAME
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, require_torch_multi_gpu, slow, torch_device


@@ -79,7 +80,7 @@ class ModelTesterMixin:

    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        inputs_dict = copy.deepcopy(inputs_dict)
-        if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+        if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
            inputs_dict = {
                k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
                if isinstance(v, torch.Tensor) and v.ndim > 1
@@ -88,9 +89,9 @@ class ModelTesterMixin:
            }

        if return_labels:
-            if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
                inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
-            elif model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+            elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                inputs_dict["start_positions"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
                )
@@ -98,18 +99,18 @@ class ModelTesterMixin:
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
                )
            elif model_class in [
-                *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(),
-                *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(),
-                *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.values(),
+                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
+                *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
            ]:
                inputs_dict["labels"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
                )
            elif model_class in [
-                *MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(),
-                *MODEL_FOR_CAUSAL_LM_MAPPING.values(),
-                *MODEL_FOR_MASKED_LM_MAPPING.values(),
-                *MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(),
+                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(MODEL_FOR_MASKED_LM_MAPPING),
+                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
            ]:
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
@@ -229,7 +230,7 @@ class ModelTesterMixin:
        config.return_dict = True

        for model_class in self.all_model_classes:
-            if model_class in MODEL_MAPPING.values():
+            if model_class in get_values(MODEL_MAPPING):
                continue
            model = model_class(config)
            model.to(torch_device)
@@ -248,7 +249,7 @@ class ModelTesterMixin:
        config.return_dict = True

        for model_class in self.all_model_classes:
-            if model_class in MODEL_MAPPING.values():
+            if model_class in get_values(MODEL_MAPPING):
                continue
            model = model_class(config)
            model.to(torch_device)
@@ -312,7 +313,7 @@ class ModelTesterMixin:
                if "labels" in inputs_dict:
                    correct_outlen += 1  # loss is added to beginning
                # Question Answering model returns start_logits and end_logits
-                if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+                if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output
                if "past_key_values" in outputs:
                    correct_outlen += 1  # past_key_values have been returned

--- a/tests/test_modeling_convbert.py
+++ b/tests/test_modeling_convbert.py
@@ -19,6 +19,7 @@ import unittest

 from tests.test_modeling_common import floats_tensor
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -352,7 +353,7 @@ class ConvBertModelTest(ModelTesterMixin, unittest.TestCase):
                if "labels" in inputs_dict:
                    correct_outlen += 1  # loss is added to beginning
                # Question Answering model returns start_logits and end_logits
-                if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+                if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output
                if "past_key_values" in outputs:
                    correct_outlen += 1  # past_key_values have been returned

--- a/tests/test_modeling_electra.py
+++ b/tests/test_modeling_electra.py
@@ -17,6 +17,7 @@
 import unittest

 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -292,7 +293,7 @@ class ElectraModelTest(ModelTesterMixin, unittest.TestCase):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                )

--- a/tests/test_modeling_flax_bert.py
+++ b/tests/test_modeling_flax_bert.py
@@ -29,6 +29,7 @@ if is_flax_available():
        FlaxBertForNextSentencePrediction,
        FlaxBertForPreTraining,
        FlaxBertForQuestionAnswering,
+        FlaxBertForSequenceClassification,
        FlaxBertForTokenClassification,
        FlaxBertModel,
    )
@@ -125,6 +126,7 @@ class FlaxBertModelTest(FlaxModelTesterMixin, unittest.TestCase):
            FlaxBertForMultipleChoice,
            FlaxBertForQuestionAnswering,
            FlaxBertForNextSentencePrediction,
+            FlaxBertForSequenceClassification,
            FlaxBertForTokenClassification,
            FlaxBertForQuestionAnswering,
        )

--- a/tests/test_modeling_funnel.py
+++ b/tests/test_modeling_funnel.py
@@ -17,6 +17,7 @@
 import unittest

 from transformers import FunnelTokenizer, is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -365,7 +366,7 @@ class FunnelModelTest(ModelTesterMixin, unittest.TestCase):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                )

--- a/tests/test_modeling_led.py
+++ b/tests/test_modeling_led.py
@@ -21,6 +21,7 @@ import unittest

 from transformers import is_torch_available
 from transformers.file_utils import cached_property
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -412,7 +413,7 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
            if "labels" in inputs_dict:
                correct_outlen += 1  # loss is added to beginning
            # Question Answering model returns start_logits and end_logits
-            if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                correct_outlen += 1  # start_logits and end_logits instead of only 1 output
            if "past_key_values" in outputs:
                correct_outlen += 1  # past_key_values have been returned

--- a/tests/test_modeling_lxmert.py
+++ b/tests/test_modeling_lxmert.py
@@ -18,6 +18,7 @@ import copy
 import unittest

 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -532,11 +533,11 @@ class LxmertModelTest(ModelTesterMixin, unittest.TestCase):
        inputs_dict = copy.deepcopy(inputs_dict)

        if return_labels:
-            if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
                )
-            elif model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                # special case for models like BERT that use multi-loss training for PreTraining
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device

--- a/tests/test_modeling_megatron_bert.py
+++ b/tests/test_modeling_megatron_bert.py
@@ -21,6 +21,7 @@ import os
 import unittest

 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -291,7 +292,7 @@ class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                )

--- a/tests/test_modeling_mobilebert.py
+++ b/tests/test_modeling_mobilebert.py
@@ -17,6 +17,7 @@
 import unittest

 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device

 from .test_configuration_common import ConfigTester
@@ -272,7 +273,7 @@ class MobileBertModelTest(ModelTesterMixin, unittest.TestCase):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                )