Unverified Commit 1de7dc74 authored by amyeroberts, committed by GitHub

Skip tests properly (#31308)

* Skip tests properly

* [test_all]

* Add 'reason' as kwarg for skipTest

* [test_all] Fix up

* [test_all]
parent 1f9f57ab
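For readers skimming the diff below, the change follows one pattern throughout: bare `return` statements that silently passed a test are replaced with `self.skipTest(reason=...)`, and existing `@unittest.skip(...)` decorators gain an explicit `reason=` keyword, so skipped tests show up as skips with their reason instead of counting as passes. A minimal sketch of the pattern follows; the class name and feature flag are illustrative, not taken from the diff:

import unittest


class ExampleModelTest(unittest.TestCase):
    # Illustrative stand-in for a ModelTesterMixin-style feature flag.
    test_pruning = False

    def test_head_pruning(self):
        # Before this change: `if not self.test_pruning: return` reported a misleading pass.
        # After: skipTest raises unittest.SkipTest, so the test is reported as skipped with a reason.
        if not self.test_pruning:
            self.skipTest(reason="Pruning is not activated")
        self.assertTrue(True)  # the real pruning checks would go here

    @unittest.skip(reason="Serialization is not supported yet")
    def test_serialization(self):
        pass

Running this with `python -m unittest -v` lists both methods as skipped together with their reasons, which is the reporting behaviour the diff aims for across the transformers test suites.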
......@@ -496,7 +496,7 @@ class Bnb4BitTestTraining(Base4bitTest):
def test_training(self):
if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.37.0"):
return
self.skipTest(reason="This test requires bitsandbytes >= 0.37.0")
# Step 1: freeze all parameters
model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True)
......
......@@ -826,7 +826,7 @@ class MixedInt8TestTraining(BaseMixedInt8Test):
def test_training(self):
if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.37.0"):
return
self.skipTest(reason="This test requires bitsandbytes>=0.37.0")
# Step 1: freeze all parameters
model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True)
......
......@@ -332,20 +332,23 @@ class QuantoQuantizationOffloadTest(QuantoQuantizationTest):
"lm_head": 0,
}
# the execution device is a gpu
@unittest.skip(reason="The execution device is a gpu")
def test_generate_quality_cpu(self):
pass
# we can't save offloaded values
@unittest.skip(reason="We can't save offloaded values")
def test_serialization_bin(self):
pass
@unittest.skip
def test_serialization_safetensors(self):
pass
@unittest.skip
def test_compare_with_quanto(self):
pass
@unittest.skip
def test_load_from_quanto_saved(self):
pass
......@@ -370,7 +373,7 @@ class QuantoQuantizationOffloadTest(QuantoQuantizationTest):
)
@unittest.skip("Skipping test class because serialization is not supported yet")
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationSerializationTest(QuantoQuantizationTest):
"""
Perform the same tests as in QuantoQuantizationTest but with a serialized model.
......@@ -403,7 +406,7 @@ class QuantoQuantizationSerializationTest(QuantoQuantizationTest):
)
@unittest.skip("Skipping test class because serialization is not supported yet")
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationSerializationCudaTest(QuantoQuantizationTest):
"""
Perform the same tests as in QuantoQuantizationTest but with model on cuda
......@@ -422,7 +425,7 @@ class QuantoQuantizationQBitsTensorOffloadTest(QuantoQuantizationOffloadTest):
weights = "int4"
@unittest.skip("Skipping test class because serialization is not supported yet")
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationQBitsTensorSerializationTest(QuantoQuantizationSerializationTest):
EXPECTED_OUTPUTS = "Hello my name is Nils, I am a student of the University"
weights = "int4"
......
......@@ -452,6 +452,6 @@ class CacheIntegrationTest(unittest.TestCase):
decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
self.assertListEqual(decoded, EXPECTED_GENERATION)
@unittest.skip("TODO @gante static cache's does not support beam search yet")
@unittest.skip(reason="TODO @gante static cache's does not support beam search yet")
def test_static_cache_beam_search(self):
pass
......@@ -161,10 +161,10 @@ class ImageProcessingTestMixin:
)
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest("Skipping slow/fast equivalence test")
self.skipTest(reason="Skipping slow/fast equivalence test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest("Skipping slow/fast equivalence test as one of the image processors is not defined")
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
......@@ -178,10 +178,10 @@ class ImageProcessingTestMixin:
@require_torch
def test_fast_is_faster_than_slow(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest("Skipping speed test")
self.skipTest(reason="Skipping speed test")
if self.image_processing_class is None or self.fast_image_processing_class is None:
self.skipTest("Skipping speed test as one of the image processors is not defined")
self.skipTest(reason="Skipping speed test as one of the image processors is not defined")
def measure_time(image_processor, image):
start = time.time()
......
......@@ -298,7 +298,7 @@ class ModelTesterMixin:
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
if model_class._keep_in_fp32_modules is None:
return
self.skipTest(reason="Model class has no _keep_in_fp32_modules attribute defined")
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
......@@ -392,7 +392,8 @@ class ModelTesterMixin:
def test_save_load_fast_init_from_base(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
if config.__class__ not in MODEL_MAPPING:
return
self.skipTest(reason="Model class not in MODEL_MAPPING")
base_class = MODEL_MAPPING[config.__class__]
if isinstance(base_class, tuple):
......@@ -522,94 +523,11 @@ class ModelTesterMixin:
self.assertEqual(tied_params1, tied_params2)
def test_fast_init_context_manager(self):
# 1. Create a dummy class. Should have buffers as well? To make sure we test __init__
class MyClass(PreTrainedModel):
config_class = PretrainedConfig
def __init__(self, config=None):
super().__init__(config if config is not None else PretrainedConfig())
self.linear = nn.Linear(10, 10, bias=True)
self.embedding = nn.Embedding(10, 10)
self.std = 1
def _init_weights(self, module):
if isinstance(module, nn.Linear):
module.weight.data = nn.init.kaiming_uniform_(module.weight.data, np.sqrt(5))
if module.bias is not None:
module.bias.data.normal_(mean=0.0, std=self.std)
# 2. Make sure a linear layer's reset params is properly skipped:
with ContextManagers([no_init_weights(True)]):
no_init_instance = MyClass()
set_seed(0)
expected_bias = torch.tensor(
([0.2975, 0.2131, -0.1379, -0.0796, -0.3012, -0.0057, -0.2381, -0.2439, -0.0174, 0.0475])
)
init_instance = MyClass()
torch.testing.assert_close(init_instance.linear.bias, expected_bias, rtol=1e-3, atol=1e-4)
set_seed(0)
torch.testing.assert_close(
init_instance.linear.weight, nn.init.kaiming_uniform_(no_init_instance.linear.weight, np.sqrt(5))
)
# 3. Make sure weights that are not present use init_weight_ and get expected values
with tempfile.TemporaryDirectory() as tmpdirname:
state_dict = init_instance.state_dict()
del state_dict["linear.weight"]
init_instance.config.save_pretrained(tmpdirname)
torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))
set_seed(0)
model_fast_init = MyClass.from_pretrained(tmpdirname)
set_seed(0)
model_slow_init = MyClass.from_pretrained(tmpdirname, _fast_init=False)
for key in model_fast_init.state_dict().keys():
max_diff = torch.max(torch.abs(model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]))
self.assertLessEqual(max_diff.item(), 1e-3, msg=f"{key} not identical")
def test_fast_init_tied_embeddings(self):
class MyClass(PreTrainedModel):
config_class = PretrainedConfig
_tied_weights_keys = ["output_embeddings.weight"]
def __init__(self, config=None):
super().__init__(config if config is not None else PretrainedConfig())
self.input_embeddings = nn.Embedding(10, 10)
self.output_embeddings = nn.Linear(10, 10, bias=False)
self.tie_weights()
def get_output_embeddings(self):
return self.output_embeddings
def set_output_embeddings(self, output_embeddings):
self.output_embeddings = output_embeddings
def get_input_embeddings(self):
return self.input_embeddings
def set_input_embeddings(self, input_embeddings):
self.input_embeddings = input_embeddings
def _init_weights(self, module):
if module is self.output_embeddings:
raise ValueError("unnecessarily initialized tied output embedding!")
model = MyClass()
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
# throws if it initializes the tied output_embeddings
MyClass.from_pretrained(tmpdirname)
def test_save_load_fast_init_to_base(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
if config.__class__ not in MODEL_MAPPING:
return
self.skipTest(reason="Model class not in MODEL_MAPPING")
base_class = MODEL_MAPPING[config.__class__]
if isinstance(base_class, tuple):
......@@ -664,7 +582,8 @@ class ModelTesterMixin:
def test_torch_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
if config.__class__ not in MODEL_MAPPING:
return
self.skipTest(reason="Model class not in MODEL_MAPPING")
base_class = MODEL_MAPPING[config.__class__]
if isinstance(base_class, tuple):
......@@ -748,38 +667,6 @@ class ModelTesterMixin:
else:
check_determinism(first, second)
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
if model.config.is_encoder_decoder:
expected_arg_names = [
"input_ids",
"attention_mask",
"decoder_input_ids",
"decoder_attention_mask",
]
expected_arg_names.extend(
["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"]
if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names
else ["encoder_outputs"]
)
self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
elif model_class.__name__ in [*get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] and self.has_attentions:
expected_arg_names = ["pixel_values", "output_hidden_states", "output_attentions", "return_dict"]
self.assertListEqual(arg_names, expected_arg_names)
elif model_class.__name__ in [*get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] and not self.has_attentions:
expected_arg_names = ["pixel_values", "output_hidden_states", "return_dict"]
self.assertListEqual(arg_names, expected_arg_names)
else:
expected_arg_names = [model.main_input_name]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_batching_equivalence(self):
"""
Tests that the model supports batching and that the output is nearly the same for the same input in
......@@ -875,7 +762,7 @@ class ModelTesterMixin:
def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
if not self.model_tester.is_training:
return
self.skipTest(reason="ModelTester is not configured to run training tests")
for model_class in self.all_model_classes:
if (
......@@ -914,7 +801,7 @@ class ModelTesterMixin:
def test_training(self):
if not self.model_tester.is_training:
return
self.skipTest(reason="ModelTester is not configured to run training tests")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......@@ -1095,7 +982,7 @@ class ModelTesterMixin:
def _create_and_check_torchscript(self, config, inputs_dict):
if not self.test_torchscript:
return
self.skipTest(reason="test_torchscript is set to `False`")
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
configs_no_init.torchscript = True
......@@ -1157,7 +1044,7 @@ class ModelTesterMixin:
if "attention_mask" in inputs:
trace_input["attention_mask"] = inputs["attention_mask"]
else:
self.skipTest("testing SDPA without attention_mask is not supported")
self.skipTest(reason="testing SDPA without attention_mask is not supported")
model(main_input, attention_mask=inputs["attention_mask"])
# example_kwarg_inputs was introduced in torch==2.0, but it is fine here since SDPA has a requirement on torch>=2.1.
......@@ -1369,7 +1256,7 @@ class ModelTesterMixin:
def test_headmasking(self):
if not self.test_head_masking:
return
self.skipTest(reason="Model does not support head masking")
global_rng.seed(42)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......@@ -1439,7 +1326,7 @@ class ModelTesterMixin:
def test_head_pruning(self):
if not self.test_pruning:
return
self.skipTest(reason="Pruning is not activated")
for model_class in self.all_model_classes:
(
......@@ -1472,7 +1359,7 @@ class ModelTesterMixin:
def test_head_pruning_save_load_from_pretrained(self):
if not self.test_pruning:
return
self.skipTest(reason="Pruning is not activated")
for model_class in self.all_model_classes:
(
......@@ -1509,7 +1396,7 @@ class ModelTesterMixin:
def test_head_pruning_save_load_from_config_init(self):
if not self.test_pruning:
return
self.skipTest(reason="Pruning is not activated")
for model_class in self.all_model_classes:
(
......@@ -1544,7 +1431,7 @@ class ModelTesterMixin:
def test_head_pruning_integration(self):
if not self.test_pruning:
return
self.skipTest(reason="Pruning is not activated")
for model_class in self.all_model_classes:
(
......@@ -1733,7 +1620,7 @@ class ModelTesterMixin:
def test_resize_position_vector_embeddings(self):
if not self.test_resize_position_embeddings:
return
self.skipTest(reason="Model does not have position embeddings")
(
original_config,
......@@ -1816,7 +1703,7 @@ class ModelTesterMixin:
inputs_dict,
) = self.model_tester.prepare_config_and_inputs_for_common()
if not self.test_resize_embeddings:
return
self.skipTest(reason="test_resize_embeddings is set to `False`")
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
......@@ -1916,13 +1803,13 @@ class ModelTesterMixin:
inputs_dict,
) = self.model_tester.prepare_config_and_inputs_for_common()
if not self.test_resize_embeddings:
return
self.skipTest(reason="test_resize_embeddings is set to `False`")
original_config.tie_word_embeddings = False
# if model cannot untie embeddings -> leave test
if original_config.tie_word_embeddings:
return
self.skipTest(reason="Model cannot untied embeddings")
for model_class in self.all_model_classes:
config = copy.deepcopy(original_config)
......@@ -1994,7 +1881,7 @@ class ModelTesterMixin:
def test_correct_missing_keys(self):
if not self.test_missing_keys:
return
self.skipTest(reason="test_missing_keys is set to `False`")
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
......@@ -2022,7 +1909,7 @@ class ModelTesterMixin:
def test_tie_model_weights(self):
if not self.test_torchscript:
return
self.skipTest(reason="test_torchscript is set to `False`")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......@@ -2481,8 +2368,7 @@ class ModelTesterMixin:
tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning
if not hasattr(transformers, tf_model_class_name):
# transformers does not have this model in TF version yet
return
self.skipTest(reason="transformers does not have TF version of this model yet")
# Output all for aggressive testing
config.output_hidden_states = True
......@@ -2664,8 +2550,7 @@ class ModelTesterMixin:
fx_model_class_name = "Flax" + model_class.__name__
if not hasattr(transformers, fx_model_class_name):
# no flax model exists for this class
return
self.skipTest(reason="No Flax model exists for this class")
# Output all for aggressive testing
config.output_hidden_states = True
......@@ -2736,8 +2621,7 @@ class ModelTesterMixin:
fx_model_class_name = "Flax" + model_class.__name__
if not hasattr(transformers, fx_model_class_name):
# no flax model exists for this class
return
self.skipTest(reason="No Flax model exists for this class")
# Output all for aggressive testing
config.output_hidden_states = True
......@@ -2849,7 +2733,7 @@ class ModelTesterMixin:
model_forward_args = inspect.signature(model.forward).parameters
if "inputs_embeds" not in model_forward_args:
self.skipTest("This model doesn't use `inputs_embeds`")
self.skipTest(reason="This model doesn't use `inputs_embeds`")
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1
......@@ -2910,7 +2794,7 @@ class ModelTesterMixin:
@require_torch_multi_gpu
def test_model_parallelization(self):
if not self.test_model_parallel:
return
self.skipTest(reason="test_model_parallel is set to False")
# a candidate for testing_utils
def get_current_gpu_memory_use():
......@@ -2972,7 +2856,7 @@ class ModelTesterMixin:
@require_torch_multi_gpu
def test_model_parallel_equal_results(self):
if not self.test_model_parallel:
return
self.skipTest(reason="test_model_parallel is set to False")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......@@ -3221,7 +3105,7 @@ class ModelTesterMixin:
def test_load_with_mismatched_shapes(self):
if not self.test_mismatched_shapes:
return
self.skipTest(reason="test_missmatched_shapes is set to False")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
......@@ -3265,7 +3149,7 @@ class ModelTesterMixin:
def test_mismatched_shapes_have_properly_initialized_weights(self):
if not self.test_mismatched_shapes:
return
self.skipTest(reason="test_missmatched_shapes is set to False")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
......@@ -3383,6 +3267,9 @@ class ModelTesterMixin:
@mark.flash_attn_test
@slow
def test_flash_attn_2_conversion(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
......@@ -3409,6 +3296,9 @@ class ModelTesterMixin:
@slow
@is_flaky()
def test_flash_attn_2_inference_equivalence(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
......@@ -3503,6 +3393,9 @@ class ModelTesterMixin:
@slow
@is_flaky()
def test_flash_attn_2_inference_equivalence_right_padding(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
......@@ -3593,6 +3486,9 @@ class ModelTesterMixin:
@slow
@is_flaky()
def test_flash_attn_2_generate_left_padding(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
......@@ -3638,6 +3534,9 @@ class ModelTesterMixin:
@is_flaky()
@slow
def test_flash_attn_2_generate_padding_right(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
......@@ -3681,6 +3580,9 @@ class ModelTesterMixin:
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self.all_model_classes[0]._supports_sdpa:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
......@@ -3979,11 +3881,14 @@ class ModelTesterMixin:
@require_torch_gpu
@slow
def test_sdpa_can_dispatch_on_flash(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
compute_capability = torch.cuda.get_device_capability()
major, _ = compute_capability
if not torch.version.cuda or major < 8:
self.skipTest("This test requires an NVIDIA GPU with compute capability >= 8.0")
self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0")
for model_class in self.all_model_classes:
if not model_class._supports_sdpa:
......@@ -3992,13 +3897,15 @@ class ModelTesterMixin:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
inputs_dict = self._prepare_for_class(inputs_dict, model_class)
if config.model_type in ["llava", "llava_next", "vipllava", "video_llava"]:
self.skipTest("Llava-like models currently (transformers==4.39.1) requires an attention_mask input")
self.skipTest(
reason="Llava-like models currently (transformers==4.39.1) requires an attention_mask input"
)
if config.model_type in ["paligemma"]:
self.skipTest(
"PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input"
)
if config.model_type in ["idefics"]:
self.skipTest("Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
......@@ -4020,11 +3927,14 @@ class ModelTesterMixin:
@require_torch_gpu
@slow
def test_sdpa_can_compile_dynamic(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
compute_capability = torch.cuda.get_device_capability()
major, _ = compute_capability
if not torch.version.cuda or major < 8:
self.skipTest("This test requires an NVIDIA GPU with compute capability >= 8.0")
self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0")
for model_class in self.all_model_classes:
if not model_class._supports_sdpa:
......@@ -4060,6 +3970,9 @@ class ModelTesterMixin:
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 30
if len(self.all_generative_model_classes) == 0:
......@@ -4130,6 +4043,9 @@ class ModelTesterMixin:
@require_torch_sdpa
def test_sdpa_matches_eager_sliding_window(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
WINDOW_ATTENTION_MODELS = ["mistral", "mixtral", "qwen2", "qwen_moe", "starcoder2"]
if len(self.all_generative_model_classes) == 0:
......@@ -4184,6 +4100,9 @@ class ModelTesterMixin:
@mark.flash_attn_test
@slow
def test_flash_attn_2_generate_use_cache(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 30
for model_class in self.all_generative_model_classes:
......@@ -4229,6 +4148,9 @@ class ModelTesterMixin:
@mark.flash_attn_test
@slow
def test_flash_attn_2_fp32_ln(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
......@@ -4284,8 +4206,7 @@ class ModelTesterMixin:
tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning
if not hasattr(transformers, tf_model_class_name):
# transformers does not have this model in TF version yet
return
self.skipTest(reason="transformers does not have this model in TF version yet")
tf_model_class = getattr(transformers, tf_model_class_name)
......@@ -4309,8 +4230,7 @@ class ModelTesterMixin:
flax_model_class_name = "Flax" + model_class.__name__ # Add the "Flax at the beginning
if not hasattr(transformers, flax_model_class_name):
# transformers does not have this model in Flax version yet
return
self.skipTest(reason="transformers does not have this model in Flax version yet")
flax_model_class = getattr(transformers, flax_model_class_name)
......@@ -4331,6 +4251,9 @@ class ModelTesterMixin:
@mark.flash_attn_test
@slow
def test_flash_attn_2_from_config(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
......@@ -4407,8 +4330,13 @@ class ModelTesterMixin:
return input_ids, position_ids, input_ids_shared_prefix, mask_shared_prefix, position_ids_shared_prefix
def test_custom_4d_attention_mask(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if len(self.all_generative_model_classes) == 0:
self.skipTest("Model architecture has no generative classes, and thus not necessarily supporting 4D masks")
self.skipTest(
reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks"
)
for model_class in self.all_generative_model_classes:
if not model_class._supports_static_cache:
......@@ -4453,7 +4381,7 @@ class ModelTesterMixin:
@require_read_token
def test_torch_compile(self):
if version.parse(torch.__version__) < version.parse("2.3"):
self.skipTest("This test requires torch >= 2.3 to run.")
self.skipTest(reason="This test requires torch >= 2.3 to run.")
if not hasattr(self, "_torch_compile_test_ckpt"):
self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.")
......
......@@ -1354,7 +1354,7 @@ class ModelUtilsTest(TestCasePlus):
self.assertIn("You may ignore this warning if your `pad_token_id`", cl.out)
if not is_torchdynamo_available():
return
self.skipTest(reason="torchdynamo is not available")
with self.subTest("Ensure that the warning code is skipped when compiling with torchdynamo."):
logger.warning_once.cache_clear()
from torch._dynamo import config, testing
......@@ -1631,7 +1631,7 @@ class ModelOnTheFlyConversionTester(unittest.TestCase):
self.assertEqual(discussion.author, "SFconvertbot")
self.assertEqual(discussion.title, "Adding `safetensors` variant of this model")
@unittest.skip("Edge case, should work once the Space is updated`")
@unittest.skip(reason="Edge case, should work once the Space is updated`")
def test_safetensors_on_the_fly_wrong_user_opened_pr(self):
config = BertConfig(
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
......@@ -1760,7 +1760,7 @@ class ModelPushToHubTester(unittest.TestCase):
except HTTPError:
pass
@unittest.skip("This test is flaky")
@unittest.skip(reason="This test is flaky")
def test_push_to_hub(self):
config = BertConfig(
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
......@@ -1800,7 +1800,7 @@ The commit description supports markdown synthax see:
)
self.assertEqual(commit_details.commit_description, COMMIT_DESCRIPTION)
@unittest.skip("This test is flaky")
@unittest.skip(reason="This test is flaky")
def test_push_to_hub_in_organization(self):
config = BertConfig(
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
......@@ -2197,7 +2197,7 @@ class TestAttentionImplementation(unittest.TestCase):
def test_not_available_flash(self):
if is_flash_attn_2_available():
self.skipTest("Please uninstall flash-attn package to run test_not_available_flash")
self.skipTest(reason="Please uninstall flash-attn package to run test_not_available_flash")
with self.assertRaises(ImportError) as cm:
_ = AutoModel.from_pretrained(
......@@ -2208,7 +2208,7 @@ class TestAttentionImplementation(unittest.TestCase):
def test_not_available_flash_with_config(self):
if is_flash_attn_2_available():
self.skipTest("Please uninstall flash-attn package to run test_not_available_flash")
self.skipTest(reason="Please uninstall flash-attn package to run test_not_available_flash")
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-GPTBigCodeModel")
......@@ -2223,7 +2223,7 @@ class TestAttentionImplementation(unittest.TestCase):
def test_not_available_sdpa(self):
if is_torch_sdpa_available():
self.skipTest("This test requires torch<=2.0")
self.skipTest(reason="This test requires torch<=2.0")
with self.assertRaises(ImportError) as cm:
_ = AutoModel.from_pretrained(
......
......@@ -248,7 +248,7 @@ class PipelineTesterMixin:
f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not load the "
f"processor from `{repo_id}` with `{processor_name}`."
)
return
self.skipTest(f"Could not load the processor from {repo_id} with {processor_name}.")
# TODO: Maybe not upload such problematic tiny models to Hub.
if tokenizer is None and processor is None:
......@@ -256,7 +256,7 @@ class PipelineTesterMixin:
f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load "
f"any tokenizer / processor from `{repo_id}`."
)
return
self.skipTest(f"Could not find or load any tokenizer / processor from {repo_id}.")
# TODO: We should check if a model file is on the Hub repo. instead.
try:
......@@ -266,7 +266,7 @@ class PipelineTesterMixin:
f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load "
f"the model from `{repo_id}` with `{model_architecture}`."
)
return
self.skipTest(f"Could not find or load the model from {repo_id} with {model_architecture}.")
pipeline_test_class_name = pipeline_test_mapping[task]["test"].__name__
if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, processor):
......@@ -275,7 +275,9 @@ class PipelineTesterMixin:
f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer "
f"`{tokenizer_name}` | processor `{processor_name}`."
)
return
self.skipTest(
f"Test is known to fail for: model `{model_architecture.__name__}` | tokenizer `{tokenizer_name}` | processor `{processor_name}`."
)
# validate
validate_test_components(self, task, model, tokenizer, processor)
......@@ -295,7 +297,7 @@ class PipelineTesterMixin:
f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not get the "
"pipeline for testing."
)
return
self.skipTest(reason="Could not get the pipeline for testing.")
task_test.run_pipeline_test(pipeline, examples)
......
......@@ -417,7 +417,7 @@ class TokenizerTesterMixin:
def test_sentencepiece_tokenize_and_convert_tokens_to_string(self):
"""Test ``_tokenize`` and ``convert_tokens_to_string``."""
if not self.test_sentencepiece:
return
self.skipTest(reason="test_sentencepiece is set to False")
tokenizer = self.get_tokenizer()
text = "This is text to test the tokenizer."
......@@ -449,7 +449,7 @@ class TokenizerTesterMixin:
def test_sentencepiece_tokenize_and_decode(self):
if not self.test_sentencepiece:
return
self.skipTest(reason="test_sentencepiece is set to False")
text = "This is text to test the tokenizer."
if self.test_rust_tokenizer:
......@@ -466,7 +466,7 @@ class TokenizerTesterMixin:
def test_subword_regularization_tokenizer(self) -> None:
if not self.test_sentencepiece:
return
self.skipTest(reason="test_sentencepiece is set to False")
# Subword regularization is only available for the slow tokenizer.
sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
......@@ -484,7 +484,7 @@ class TokenizerTesterMixin:
def test_pickle_subword_regularization_tokenizer(self) -> None:
if not self.test_sentencepiece:
return
self.skipTest(reason="test_sentencepiece is set to False")
"""Google pickle __getstate__ __setstate__ if you are struggling with this."""
# Subword regularization is only available for the slow tokenizer.
......@@ -506,7 +506,7 @@ class TokenizerTesterMixin:
def test_save_sentencepiece_tokenizer(self) -> None:
if not self.test_sentencepiece or not self.test_slow_tokenizer:
return
self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False")
# We want to verify that we will be able to save the tokenizer even if the original files that were used to
# build the tokenizer have been deleted in the meantime.
text = "This is text to test the tokenizer."
......@@ -545,7 +545,7 @@ class TokenizerTesterMixin:
def test_rust_tokenizer_signature(self):
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
signature = inspect.signature(self.rust_tokenizer_class.__init__)
......@@ -554,7 +554,7 @@ class TokenizerTesterMixin:
def test_tokenizer_slow_store_full_signature(self):
if not self.test_slow_tokenizer:
return
self.skipTest(reason="test_slow_tokenizer is set to False")
signature = inspect.signature(self.tokenizer_class.__init__)
tokenizer = self.get_tokenizer()
......@@ -565,7 +565,7 @@ class TokenizerTesterMixin:
def test_tokenizer_fast_store_full_signature(self):
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
signature = inspect.signature(self.rust_tokenizer_class.__init__)
tokenizer = self.get_rust_tokenizer()
......@@ -580,11 +580,11 @@ class TokenizerTesterMixin:
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
......@@ -1973,7 +1973,7 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.pad_token is None:
self.skipTest("No padding token.")
self.skipTest(reason="No padding token.")
else:
empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
......@@ -2007,9 +2007,9 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.pad_token is None:
self.skipTest("No padding token.")
self.skipTest(reason="No padding token.")
if "attention_mask" not in tokenizer.model_input_names:
self.skipTest("This model does not use attention mask.")
self.skipTest(reason="This model does not use attention mask.")
features = [
{"input_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]},
......@@ -2126,7 +2126,7 @@ class TokenizerTesterMixin:
def test_padding_warning_message_fast_tokenizer(self):
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
sequence = "This is a text"
......@@ -2146,7 +2146,7 @@ class TokenizerTesterMixin:
)
if not self.test_slow_tokenizer:
return
self.skipTest(reason="test_slow_tokenizer is set to False")
tokenizer_slow = self.get_tokenizer()
# check correct behaviour if no pad_token_id exists and add it eventually
......@@ -2295,8 +2295,8 @@ class TokenizerTesterMixin:
@require_tokenizers
def test_added_token_are_matched_longest_first(self):
if not self.test_slow_tokenizer:
self.skipTest("This test is only for slow tokenizers")
return
self.skipTest(reason="This test is only for slow tokenizers")
tokenizers = self.get_tokenizers(fast=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
......@@ -2305,7 +2305,7 @@ class TokenizerTesterMixin:
tokenizer.add_tokens([AddedToken("extra_id_100")])
except Exception:
# Canine cannot add tokens which are not codepoints
self.skipTest("Cannot add those Added tokens")
self.skipTest(reason="Cannot add those Added tokens")
# XXX: This used to split on `extra_id_1` first we're matching
# longest first now.
......@@ -2588,13 +2588,13 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
return
self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
return
self.skipTest(reason="Model is not an encoder-decoder model or has no set pad token id")
model = model_class(config)
......@@ -2637,13 +2637,13 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
return
self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
return
self.skipTest(reason="Model is not an encoder-decoder model or has no set pad token id")
model = model_class(config)
......@@ -2672,13 +2672,13 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
return
self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
return
self.skip("Model is not an encoder-decoder model or has no set pad token id")
# Build sequence
first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
......@@ -2712,7 +2712,7 @@ class TokenizerTesterMixin:
@require_torch
def test_prepare_seq2seq_batch(self):
if not self.test_seq2seq:
return
self.skipTest(reason="test_seq2seq is set to False")
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
......@@ -2740,7 +2740,7 @@ class TokenizerTesterMixin:
src_lang="en_XX", # this should be ignored (for all but mbart) but not cause an error
)
except NotImplementedError:
return
self.skipTest(reason="Encountered NotImplementedError calling prepare_seq2seq_batch")
self.assertEqual(batch.input_ids.shape[1], 3)
self.assertEqual(batch.labels.shape[1], 10)
# max_target_length will default to max_length if not specified
......@@ -3008,7 +3008,7 @@ class TokenizerTesterMixin:
def test_tokenization_python_rust_equals(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3049,7 +3049,7 @@ class TokenizerTesterMixin:
def test_num_special_tokens_to_add_equal(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3067,7 +3067,7 @@ class TokenizerTesterMixin:
def test_max_length_equal(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3081,7 +3081,7 @@ class TokenizerTesterMixin:
def test_special_tokens_map_equal(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3177,10 +3177,10 @@ class TokenizerTesterMixin:
elif is_flax_available():
returned_tensor = "jax"
else:
return
self.skipTest(reason="No expected framework from PT, TF or JAX found")
if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
return
self.skipTest(reason="This tokenizer has no padding token set, or pad_token_id < 0")
tokens = tokenizer.encode_plus(
"HuggingFace is solving NLP one commit at a time",
......@@ -3225,7 +3225,7 @@ class TokenizerTesterMixin:
def test_compare_pretokenized_inputs(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3307,7 +3307,7 @@ class TokenizerTesterMixin:
def test_create_token_type_ids(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3329,7 +3329,7 @@ class TokenizerTesterMixin:
def test_build_inputs_with_special_tokens(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3374,7 +3374,7 @@ class TokenizerTesterMixin:
def test_padding(self, max_length=50):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3600,7 +3600,7 @@ class TokenizerTesterMixin:
def test_padding_different_model_input_name(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3638,7 +3638,7 @@ class TokenizerTesterMixin:
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3713,7 +3713,7 @@ class TokenizerTesterMixin:
def test_embeded_special_tokens(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3781,7 +3781,7 @@ class TokenizerTesterMixin:
def test_compare_prepare_for_model(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -3884,7 +3884,7 @@ class TokenizerTesterMixin:
def test_training_new_tokenizer(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
tokenizer = self.get_rust_tokenizer()
new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)
......@@ -3919,7 +3919,7 @@ class TokenizerTesterMixin:
def test_training_new_tokenizer_with_special_tokens_change(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
tokenizer = self.get_rust_tokenizer()
# Test with a special tokens map
......@@ -4092,7 +4092,7 @@ class TokenizerTesterMixin:
def test_save_slow_from_fast_and_reload_fast(self):
if not self.test_slow_tokenizer or not self.test_rust_tokenizer:
# we need both slow and fast versions
return
self.skipTest(reason="test_rust_tokenizer or test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -4166,7 +4166,7 @@ class TokenizerTesterMixin:
def test_split_special_tokens(self):
if not self.test_slow_tokenizer:
return
self.skipTest(reason="test_slow_tokenizer is set to False")
# Tests the expected appearance (or absence) of special token in encoded output,
# explicit values are not tested because tokenization is model dependent and can change
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
......
......@@ -96,7 +96,7 @@ class TokenizerUtilTester(unittest.TestCase):
# the current folder and have the right name.
if os.path.isfile("tokenizer.json"):
# We skip the test if the user has a `tokenizer.json` in this folder to avoid deleting it.
return
self.skipTest(reason="Skipping test as there is a `tokenizer.json` file in the current folder.")
try:
with open("tokenizer.json", "wb") as f:
http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/tokenizer.json", f)
......
......@@ -47,9 +47,10 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
tokenizer.save_pretrained(self.tmpdirname)
@unittest.skip(
"We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any model"
)
def test_tokenizer_mismatch_warning(self):
# We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any
# model
pass
@unittest.skip(
......@@ -70,13 +71,12 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
def test_additional_special_tokens_serialization(self):
pass
@unittest.skip(reason="PreTrainedTokenizerFast is the only tokenizer that is not linked to any model")
def test_prepare_for_model(self):
# We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any
# model
pass
@unittest.skip(reason="PreTrainedTokenizerFast doesn't have tokenizer_file in its signature")
def test_rust_tokenizer_signature(self):
# PreTrainedTokenizerFast doesn't have tokenizer_file in its signature
pass
def test_training_new_tokenizer(self):
......
......@@ -3066,7 +3066,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
torchdynamo.reset()
@unittest.skip("torch 2.0.0 gives `ModuleNotFoundError: No module named 'torchdynamo'`.")
@unittest.skip(reason="torch 2.0.0 gives `ModuleNotFoundError: No module named 'torchdynamo'`.")
@require_torch_non_multi_gpu
@require_torchdynamo
def test_torchdynamo_memory(self):
......@@ -3668,7 +3668,7 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
def test_push_to_hub_with_saves_each_n_steps(self):
num_gpus = max(1, backend_device_count(torch_device))
if num_gpus > 2:
return
self.skipTest(reason="More than 2 GPUs available")
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(
......
......@@ -26,7 +26,7 @@ from transformers.testing_utils import require_tf, require_torch, slow
logger = logging.getLogger()
@unittest.skip("Temporarily disable the doc tests.")
@unittest.skip(reason="Temporarily disable the doc tests.")
@require_torch
@require_tf
@slow
......
......@@ -159,11 +159,11 @@ class ModelOutputTester(unittest.TestCase):
)
# TODO: @ydshieh
@unittest.skip("CPU OOM")
@unittest.skip(reason="CPU OOM")
@require_torch
def test_export_serialization(self):
if not is_torch_greater_or_equal_than_2_2:
return
self.skipTest(reason="Export serialization requires torch >= 2.2.0")
model_cls = AlbertForMaskedLM
model_config = model_cls.config_class()
......