Unverified Commit 29970757 authored by Dhruv Nair, committed by GitHub

Fast Tests on PR improvements: Batch Tests fixes (#5080)



* fix test

* initial commit

* change test

* updates:

* fix tests

* test fix

* test fix

* fix tests

* make test faster

* clean up

* fix precision in test

* fix precision

* Fix tests

* Fix logging test

* fix test

* fix test

---------
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent c2787c11
@@ -359,7 +359,7 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
 
     def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(test_mean_pixel_difference=False)
+        self._test_inference_batch_single_identical()
 
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
...
@@ -459,7 +459,7 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 
     def test_inference_batch_single_identical(self):
         # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
-        self._test_inference_batch_single_identical(test_mean_pixel_difference=False, expected_max_diff=2e-4)
+        self._test_inference_batch_single_identical(expected_max_diff=2e-4)
 
     def test_save_load_local(self):
         # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
...
@@ -96,7 +96,7 @@ class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         self.assertLessEqual(max_diff, 1e-3)
 
     def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3)
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
 
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
...
@@ -224,15 +224,7 @@ class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 
     @skip_mps
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
-        test_mean_pixel_difference = False
-
-        self._test_inference_batch_single_identical(
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
-            test_mean_pixel_difference=test_mean_pixel_difference,
-        )
+        self._test_inference_batch_single_identical(expected_max_diff=1e-2)
 
     @skip_mps
     def test_attention_slicing_forward_pass(self):
...
@@ -224,15 +224,7 @@ class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
 
     @skip_mps
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
-        test_mean_pixel_difference = False
-
-        self._test_inference_batch_single_identical(
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
-            test_mean_pixel_difference=test_mean_pixel_difference,
-        )
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
 
     @skip_mps
     def test_attention_slicing_forward_pass(self):
...
@@ -234,15 +234,7 @@ class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.Te
 
     @skip_mps
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
-        test_mean_pixel_difference = False
-
-        self._test_inference_batch_single_identical(
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
-            test_mean_pixel_difference=test_mean_pixel_difference,
-        )
+        self._test_inference_batch_single_identical(expected_max_diff=1e-2)
 
     @skip_mps
     def test_attention_slicing_forward_pass(self):
...
@@ -373,7 +373,7 @@ class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
 
     def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(test_mean_pixel_difference=False)
+        self._test_inference_batch_single_identical()
 
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
...
@@ -44,11 +44,11 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     @property
     def text_embedder_hidden_size(self):
-        return 32
+        return 16
 
     @property
     def time_input_dim(self):
-        return 32
+        return 16
 
     @property
     def time_embed_dim(self):
@@ -201,14 +201,7 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         self._test_inference_batch_consistent(batch_sizes=[1, 2])
 
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
-
-        self._test_inference_batch_single_identical(
-            batch_size=2,
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
-        )
+        self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=6e-3)
 
     def test_num_images_per_prompt(self):
         components = self.get_dummy_components()
...
@@ -52,11 +52,11 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     @property
     def text_embedder_hidden_size(self):
-        return 32
+        return 16
 
     @property
     def time_input_dim(self):
-        return 32
+        return 16
 
     @property
     def time_embed_dim(self):
@@ -71,10 +71,10 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         torch.manual_seed(0)
         config = CLIPVisionConfig(
             hidden_size=self.text_embedder_hidden_size,
-            image_size=64,
+            image_size=32,
             projection_dim=self.text_embedder_hidden_size,
-            intermediate_size=37,
-            num_attention_heads=4,
+            intermediate_size=24,
+            num_attention_heads=2,
             num_channels=3,
             num_hidden_layers=5,
             patch_size=1,
@@ -170,7 +170,7 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         return components
 
     def get_dummy_inputs(self, device, seed=0):
-        input_image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
+        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
         if str(device).startswith("mps"):
             generator = torch.manual_seed(seed)
@@ -219,15 +219,12 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def test_inference_batch_consistent(self):
         # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches
-        self._test_inference_batch_consistent(batch_sizes=[1, 2])
+        self._test_inference_batch_consistent(batch_sizes=[2])
 
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
         self._test_inference_batch_single_identical(
             batch_size=2,
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
+            expected_max_diff=5e-3,
         )
 
     def test_num_images_per_prompt(self):
...
@@ -499,14 +499,7 @@ class StableDiffusionPipelineFastTests(
         negative_prompt = None
         num_images_per_prompt = 1
         logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
-        logger.setLevel(logging.WARNING)
-
-        prompt = 25 * "@"
-        with CaptureLogger(logger) as cap_logger_3:
-            negative_text_embeddings_3, text_embeddings_3 = sd_pipe.encode_prompt(
-                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
-            )
-        if negative_text_embeddings_3 is not None:
-            text_embeddings_3 = torch.cat([negative_text_embeddings_3, text_embeddings_3])
 
         prompt = 100 * "@"
         with CaptureLogger(logger) as cap_logger:
@@ -516,6 +509,9 @@ class StableDiffusionPipelineFastTests(
         if negative_text_embeddings is not None:
             text_embeddings = torch.cat([negative_text_embeddings, text_embeddings])
 
+        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
+        assert cap_logger.out.count("@") == 25
+
         negative_prompt = "Hello"
         with CaptureLogger(logger) as cap_logger_2:
             negative_text_embeddings_2, text_embeddings_2 = sd_pipe.encode_prompt(
@@ -524,12 +520,18 @@ class StableDiffusionPipelineFastTests(
         if negative_text_embeddings_2 is not None:
             text_embeddings_2 = torch.cat([negative_text_embeddings_2, text_embeddings_2])
 
+        assert cap_logger.out == cap_logger_2.out
+
+        prompt = 25 * "@"
+        with CaptureLogger(logger) as cap_logger_3:
+            negative_text_embeddings_3, text_embeddings_3 = sd_pipe.encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+        if negative_text_embeddings_3 is not None:
+            text_embeddings_3 = torch.cat([negative_text_embeddings_3, text_embeddings_3])
+
         assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
         assert text_embeddings.shape[1] == 77
 
-        assert cap_logger.out == cap_logger_2.out
-        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
-        assert cap_logger.out.count("@") == 25
         assert cap_logger_3.out == ""
 
     def test_stable_diffusion_height_width_opt(self):
...
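The count asserted here follows from CLIP's 77-token context window. A minimal sketch of that arithmetic, assuming each "@" encodes to a single token (which holds for the CLIP tokenizer used in these tests):

# Sketch only, not part of the diff: why the captured warning contains exactly 25 "@" characters.
model_max_length = 77                  # CLIP text encoder context, including BOS and EOS
prompt_length = 100                    # prompt is 100 * "@"
kept = model_max_length - 2            # one slot each for BOS and EOS
truncated = prompt_length - kept
assert truncated == 25                 # matches "100 - 77 + 1 (BOS token) + 1 (EOS token) = 25"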
@@ -250,6 +250,7 @@ class StableDiffusion2PipelineFastTests(
         negative_prompt = None
         num_images_per_prompt = 1
         logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
+        logger.setLevel(logging.WARNING)
 
         prompt = 25 * "@"
         with CaptureLogger(logger) as cap_logger_3:
...
@@ -182,9 +182,7 @@ class StableUnCLIPPipelineFastTests(
     # Overriding PipelineTesterMixin::test_inference_batch_single_identical
     # because UnCLIP undeterminism requires a looser check.
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device in ["cpu", "mps"]
-
-        self._test_inference_batch_single_identical(test_max_difference=test_max_difference)
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
 
 
 @slow
...
@@ -196,9 +196,7 @@ class StableUnCLIPImg2ImgPipelineFastTests(
     # Overriding PipelineTesterMixin::test_inference_batch_single_identical
     # because undeterminism requires a looser check.
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device in ["cpu", "mps"]
-
-        self._test_inference_batch_single_identical(test_max_difference=test_max_difference)
+        self._test_inference_batch_single_identical(expected_max_diff=1e-3)
 
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
...
@@ -374,11 +374,11 @@ class PipelineTesterMixin:
             f"Required optional parameters not present: {remaining_required_optional_parameters}",
         )
 
-    def test_inference_batch_consistent(self, batch_sizes=[2, 4, 13]):
+    def test_inference_batch_consistent(self, batch_sizes=[2]):
         self._test_inference_batch_consistent(batch_sizes=batch_sizes)
 
     def _test_inference_batch_consistent(
-        self, batch_sizes=[2, 4, 13], additional_params_copy_to_batched_inputs=["num_inference_steps"]
+        self, batch_sizes=[2], additional_params_copy_to_batched_inputs=["num_inference_steps"]
     ):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
@@ -386,137 +386,103 @@ class PipelineTesterMixin:
         pipe.set_progress_bar_config(disable=None)
 
         inputs = self.get_dummy_inputs(torch_device)
+        inputs["generator"] = self.get_generator(0)
 
         logger = logging.get_logger(pipe.__module__)
         logger.setLevel(level=diffusers.logging.FATAL)
 
-        # batchify inputs
+        # prepare batched inputs
+        batched_inputs = []
         for batch_size in batch_sizes:
-            batched_inputs = {}
-            for name, value in inputs.items():
-                if name in self.batch_params:
-                    # prompt is string
-                    if name == "prompt":
-                        len_prompt = len(value)
-                        # make unequal batch sizes
-                        batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
-
-                        # make last batch super long
-                        batched_inputs[name][-1] = 100 * "very long"
-                    # or else we have images
-                    else:
-                        batched_inputs[name] = batch_size * [value]
-                elif name == "batch_size":
-                    batched_inputs[name] = batch_size
-                else:
-                    batched_inputs[name] = value
-
-            for arg in additional_params_copy_to_batched_inputs:
-                batched_inputs[arg] = inputs[arg]
-
-            batched_inputs["output_type"] = "np"
-
-            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
-                batched_inputs.pop("output_type")
-
-            output = pipe(**batched_inputs)
-            assert len(output[0]) == batch_size
-
-            batched_inputs["output_type"] = "np"
-
-            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
-                batched_inputs.pop("output_type")
-
-            output = pipe(**batched_inputs)[0]
-
-            assert output.shape[0] == batch_size
+            batched_input = {}
+            batched_input.update(inputs)
+
+            for name in self.batch_params:
+                if name not in inputs:
+                    continue
+
+                value = inputs[name]
+                if name == "prompt":
+                    len_prompt = len(value)
+                    # make unequal batch sizes
+                    batched_input[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+
+                    # make last batch super long
+                    batched_input[name][-1] = 100 * "very long"
+                else:
+                    batched_input[name] = batch_size * [value]
+
+            if "generator" in inputs:
+                batched_input["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+            if "batch_size" in inputs:
+                batched_input["batch_size"] = batch_size
+
+            batched_inputs.append(batched_input)
 
         logger.setLevel(level=diffusers.logging.WARNING)
+        for batch_size, batched_input in zip(batch_sizes, batched_inputs):
+            output = pipe(**batched_input)
+            assert len(output[0]) == batch_size
 
     def test_inference_batch_single_identical(self, batch_size=3, expected_max_diff=1e-4):
         self._test_inference_batch_single_identical(batch_size=batch_size, expected_max_diff=expected_max_diff)
 
     def _test_inference_batch_single_identical(
         self,
-        batch_size=3,
-        test_max_difference=None,
-        test_mean_pixel_difference=None,
-        relax_max_difference=False,
+        batch_size=2,
         expected_max_diff=1e-4,
         additional_params_copy_to_batched_inputs=["num_inference_steps"],
     ):
-        if test_max_difference is None:
-            # TODO(Pedro) - not sure why, but not at all reproducible at the moment it seems
-            # make sure that batched and non-batched is identical
-            test_max_difference = torch_device != "mps"
-
-        if test_mean_pixel_difference is None:
-            # TODO same as above
-            test_mean_pixel_difference = torch_device != "mps"
-
-        generator_device = "cpu"
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
-        for components in pipe.components.values():
-            if hasattr(components, "set_default_attn_processor"):
-                components.set_default_attn_processor()
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
-        inputs = self.get_dummy_inputs(generator_device)
+        inputs = self.get_dummy_inputs(torch_device)
+        # Reset generator in case it is has been used in self.get_dummy_inputs
+        inputs["generator"] = self.get_generator(0)
 
         logger = logging.get_logger(pipe.__module__)
         logger.setLevel(level=diffusers.logging.FATAL)
 
         # batchify inputs
         batched_inputs = {}
-        batch_size = batch_size
-        for name, value in inputs.items():
-            if name in self.batch_params:
-                # prompt is string
-                if name == "prompt":
-                    len_prompt = len(value)
-                    # make unequal batch sizes
-                    batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
-
-                    # make last batch super long
-                    batched_inputs[name][-1] = 100 * "very long"
-                # or else we have images
-                else:
-                    batched_inputs[name] = batch_size * [value]
-            elif name == "batch_size":
-                batched_inputs[name] = batch_size
-            elif name == "generator":
-                batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)]
-            else:
-                batched_inputs[name] = value
-
-        for arg in additional_params_copy_to_batched_inputs:
-            batched_inputs[arg] = inputs[arg]
-
-        if self.pipeline_class.__name__ != "DanceDiffusionPipeline":
-            batched_inputs["output_type"] = "np"
-
-        output_batch = pipe(**batched_inputs)
-        assert output_batch[0].shape[0] == batch_size
-
-        inputs["generator"] = self.get_generator(0)
+        batched_inputs.update(inputs)
+
+        for name in self.batch_params:
+            if name not in inputs:
+                continue
+
+            value = inputs[name]
+            if name == "prompt":
+                len_prompt = len(value)
+                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                batched_inputs[name][-1] = 100 * "very long"
+            else:
+                batched_inputs[name] = batch_size * [value]
+
+        if "generator" in inputs:
+            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+        if "batch_size" in inputs:
+            batched_inputs["batch_size"] = batch_size
+
+        for arg in additional_params_copy_to_batched_inputs:
+            batched_inputs[arg] = inputs[arg]
 
         output = pipe(**inputs)
+        output_batch = pipe(**batched_inputs)
 
-        logger.setLevel(level=diffusers.logging.WARNING)
-        if test_max_difference:
-            if relax_max_difference:
-                # Taking the median of the largest <n> differences
-                # is resilient to outliers
-                diff = np.abs(output_batch[0][0] - output[0][0])
-                diff = diff.flatten()
-                diff.sort()
-                max_diff = np.median(diff[-5:])
-            else:
-                max_diff = np.abs(output_batch[0][0] - output[0][0]).max()
-            assert max_diff < expected_max_diff
+        assert output_batch[0].shape[0] == batch_size
 
-        if test_mean_pixel_difference:
-            assert_mean_pixel_difference(output_batch[0][0], output[0][0])
+        max_diff = np.abs(output_batch[0][0] - output[0][0]).max()
+        assert max_diff < expected_max_diff
 
     def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
         components = self.get_dummy_components()
@@ -528,8 +494,9 @@ class PipelineTesterMixin:
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
-        output = pipe(**self.get_dummy_inputs(torch_device))[0]
-        output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]
+        generator_device = "cpu"
+        output = pipe(**self.get_dummy_inputs(generator_device))[0]
+        output_tuple = pipe(**self.get_dummy_inputs(generator_device), return_dict=False)[0]
 
         max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
         self.assertLess(max_diff, expected_max_difference)
@@ -710,11 +677,12 @@ class PipelineTesterMixin:
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         output_without_slicing = pipe(**inputs)[0]
 
         pipe.enable_attention_slicing(slice_size=1)
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs(generator_device)
         output_with_slicing = pipe(**inputs)[0]
 
         if test_max_difference:
...
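The batching strategy the refactored mixin uses can be read in isolation. The following stand-alone sketch mirrors what `_test_inference_batch_single_identical` now does with `batch_params`; the `batch_params` set and the `inputs` dict here are toy stand-ins, not values taken from any real pipeline test:

import torch

# Stand-alone sketch of the batching logic above (illustrative inputs only).
batch_params = {"prompt"}
inputs = {
    "prompt": "a photo of a cat",
    "num_inference_steps": 2,
    "generator": torch.Generator("cpu").manual_seed(0),
}
batch_size = 2

batched_inputs = dict(inputs)  # start from the single-sample inputs
for name in batch_params:
    if name not in inputs:
        continue
    value = inputs[name]
    if name == "prompt":
        # unequal prompt lengths, with a very long last prompt, to exercise padding/truncation
        len_prompt = len(value)
        batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
        batched_inputs[name][-1] = 100 * "very long"
    else:
        batched_inputs[name] = batch_size * [value]

# one seeded generator per sample keeps the batched run comparable to the single-sample run
if "generator" in inputs:
    batched_inputs["generator"] = [torch.Generator("cpu").manual_seed(i) for i in range(batch_size)]

assert len(batched_inputs["prompt"]) == batch_size
assert len(batched_inputs["generator"]) == batch_size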
@@ -62,14 +62,14 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = UNet3DConditionModel(
-            block_out_channels=(32, 64, 64, 64),
+            block_out_channels=(32, 32),
             layers_per_block=2,
             sample_size=32,
             in_channels=4,
             out_channels=4,
-            down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
-            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
-            cross_attention_dim=32,
+            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
+            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
+            cross_attention_dim=4,
             attention_head_dim=4,
         )
         scheduler = DDIMScheduler(
@@ -81,27 +81,27 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=(32,),
             in_channels=3,
             out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            down_block_types=["DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D"],
             latent_channels=4,
-            sample_size=128,
+            sample_size=32,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
+            hidden_size=4,
+            intermediate_size=16,
             layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
+            num_attention_heads=2,
+            num_hidden_layers=2,
             pad_token_id=1,
             vocab_size=1000,
             hidden_act="gelu",
-            projection_dim=512,
+            projection_dim=32,
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
@@ -141,8 +141,8 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         frames = sd_pipe(**inputs).frames
         image_slice = frames[0][-3:, -3:, -1]
 
-        assert frames[0].shape == (64, 64, 3)
-        expected_slice = np.array([158.0, 160.0, 153.0, 125.0, 100.0, 121.0, 111.0, 93.0, 113.0])
+        assert frames[0].shape == (32, 32, 3)
+        expected_slice = np.array([91.0, 152.0, 66.0, 192.0, 94.0, 126.0, 101.0, 123.0, 152.0])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
...
@@ -82,7 +82,7 @@ class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             beta_start=0.00085,
             beta_end=0.012,
             beta_schedule="scaled_linear",
-            clip_sample=False,
+            clip_sample=True,
             set_alpha_to_one=False,
         )
         torch.manual_seed(0)
...
@@ -373,8 +373,6 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     # because UnCLIP undeterminism requires a looser check.
     @skip_mps
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
         additional_params_copy_to_batched_inputs = [
             "prior_num_inference_steps",
             "decoder_num_inference_steps",
@@ -382,9 +380,7 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         ]
 
         self._test_inference_batch_single_identical(
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
-            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs,
+            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=5e-3
         )
 
     def test_inference_batch_consistent(self):
...
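The `additional_params_copy_to_batched_inputs` argument passed here names scalar settings, such as step counts, that the mixin copies into the batched call unchanged instead of expanding per sample (see the `for arg in additional_params_copy_to_batched_inputs` loop in the mixin diff above). A minimal sketch; the step-count values below are illustrative, not the ones used in the actual tests:

# Sketch only, not part of the diff.
inputs = {
    "prior_num_inference_steps": 2,
    "decoder_num_inference_steps": 2,
    "super_res_num_inference_steps": 2,
}
additional_params_copy_to_batched_inputs = list(inputs)

batched_inputs = {}
for arg in additional_params_copy_to_batched_inputs:
    batched_inputs[arg] = inputs[arg]  # copied as a single scalar, not replicated per sample

assert batched_inputs == inputs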
@@ -448,17 +448,12 @@ class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCa
     # because UnCLIP undeterminism requires a looser check.
     @skip_mps
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
         additional_params_copy_to_batched_inputs = [
             "decoder_num_inference_steps",
             "super_res_num_inference_steps",
         ]
 
         self._test_inference_batch_single_identical(
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
-            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs,
+            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=5e-3
         )
 
     def test_inference_batch_consistent(self):
...
@@ -170,15 +170,7 @@ class WuerstchenDecoderPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
 
     @skip_mps
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
-        test_mean_pixel_difference = False
-
-        self._test_inference_batch_single_identical(
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
-            test_mean_pixel_difference=test_mean_pixel_difference,
-        )
+        self._test_inference_batch_single_identical(expected_max_diff=1e-5)
 
     @skip_mps
     def test_attention_slicing_forward_pass(self):
...
@@ -166,14 +166,7 @@ class WuerstchenPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 
     @skip_mps
     def test_inference_batch_single_identical(self):
-        test_max_difference = torch_device == "cpu"
-        relax_max_difference = True
-        test_mean_pixel_difference = False
-
         self._test_inference_batch_single_identical(
-            test_max_difference=test_max_difference,
-            relax_max_difference=relax_max_difference,
-            test_mean_pixel_difference=test_mean_pixel_difference,
             expected_max_diff=2e-1,
         )
...