[tests] remove tests for deprecated pipelines. (#11879)

* remove tests for deprecated pipelines.
* remove folders
* test_pipelines_common

[tests] remove tests for deprecated pipelines. (#11879)
* remove tests for deprecated pipelines.
* remove folders
* test_pipelines_common
bc55b631 · Sayak Paul · GitHub · 15d50f16 · 15d50f16 · 15d50f16
Unverified commit bc55b631, authored Jul 08, 2025 by Sayak Paul; committed by GitHub on Jul 08, 2025
20 changed files
--- a/tests/pipelines/amused/__init__.py
+++ b/tests/pipelines/amused/__init__.py
--- a/tests/pipelines/amused/test_amused.py
+++ b/tests/pipelines/amused/test_amused.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
-from diffusers import AmusedPipeline, AmusedScheduler, UVit2DModel, VQModel
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    require_torch_accelerator,
-    slow,
-    torch_device,
-)
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class AmusedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = AmusedPipeline
-    params = TEXT_TO_IMAGE_PARAMS | {"encoder_hidden_states", "negative_encoder_hidden_states"}
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    test_layerwise_casting = True
-    test_group_offloading = True
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        transformer = UVit2DModel(
-            hidden_size=8,
-            use_bias=False,
-            hidden_dropout=0.0,
-            cond_embed_dim=8,
-            micro_cond_encode_dim=2,
-            micro_cond_embed_dim=10,
-            encoder_hidden_size=8,
-            vocab_size=32,
-            codebook_size=8,
-            in_channels=8,
-            block_out_channels=8,
-            num_res_blocks=1,
-            downsample=True,
-            upsample=True,
-            block_num_heads=1,
-            num_hidden_layers=1,
-            num_attention_heads=1,
-            attention_dropout=0.0,
-            intermediate_size=8,
-            layer_norm_eps=1e-06,
-            ln_elementwise_affine=True,
-        )
-        scheduler = AmusedScheduler(mask_token_id=31)
-        torch.manual_seed(0)
-        vqvae = VQModel(
-            act_fn="silu",
-            block_out_channels=[8],
-            down_block_types=["DownEncoderBlock2D"],
-            in_channels=3,
-            latent_channels=8,
-            layers_per_block=1,
-            norm_num_groups=8,
-            num_vq_embeddings=8,
-            out_channels=3,
-            sample_size=8,
-            up_block_types=["UpDecoderBlock2D"],
-            mid_block_add_attention=False,
-            lookup_from_codebook=True,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=8,
-            intermediate_size=8,
-            layer_norm_eps=1e-05,
-            num_attention_heads=1,
-            num_hidden_layers=1,
-            pad_token_id=1,
-            vocab_size=1000,
-            projection_dim=8,
-        )
-        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        components = {
-            "transformer": transformer,
-            "scheduler": scheduler,
-            "vqvae": vqvae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "output_type": "np",
-            "height": 4,
-            "width": 4,
-        }
-        return inputs
-    def test_inference_batch_consistent(self, batch_sizes=[2]):
-        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
-    @unittest.skip("aMUSEd does not support lists of generators")
-    def test_inference_batch_single_identical(self): ...
-@slow
-@require_torch_accelerator
-class AmusedPipelineSlowTests(unittest.TestCase):
-    def test_amused_256(self):
-        pipe = AmusedPipeline.from_pretrained("amused/amused-256")
-        pipe.to(torch_device)
-        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.4011, 0.3992, 0.379, 0.3856, 0.3772, 0.3711, 0.3919, 0.385, 0.3625])
-        assert np.abs(image_slice - expected_slice).max() < 0.003
-    def test_amused_256_fp16(self):
-        pipe = AmusedPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16)
-        pipe.to(torch_device)
-        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.0554, 0.05129, 0.0344, 0.0452, 0.0476, 0.0271, 0.0495, 0.0527, 0.0158])
-        assert np.abs(image_slice - expected_slice).max() < 0.007
-    def test_amused_512(self):
-        pipe = AmusedPipeline.from_pretrained("amused/amused-512")
-        pipe.to(torch_device)
-        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.1199, 0.1171, 0.1229, 0.1188, 0.1210, 0.1147, 0.1260, 0.1346, 0.1152])
-        assert np.abs(image_slice - expected_slice).max() < 0.003
-    def test_amused_512_fp16(self):
-        pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
-        pipe.to(torch_device)
-        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.1509, 0.1492, 0.1531, 0.1485, 0.1501, 0.1465, 0.1581, 0.1690, 0.1499])
-        assert np.abs(image_slice - expected_slice).max() < 0.003
--- a/tests/pipelines/amused/test_amused_img2img.py
+++ b/tests/pipelines/amused/test_amused_img2img.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
-from diffusers import AmusedImg2ImgPipeline, AmusedScheduler, UVit2DModel, VQModel
-from diffusers.utils import load_image
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    require_torch_accelerator,
-    slow,
-    torch_device,
-)
-from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class AmusedImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = AmusedImg2ImgPipeline
-    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "latents"}
-    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
-    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        transformer = UVit2DModel(
-            hidden_size=8,
-            use_bias=False,
-            hidden_dropout=0.0,
-            cond_embed_dim=8,
-            micro_cond_encode_dim=2,
-            micro_cond_embed_dim=10,
-            encoder_hidden_size=8,
-            vocab_size=32,
-            codebook_size=8,
-            in_channels=8,
-            block_out_channels=8,
-            num_res_blocks=1,
-            downsample=True,
-            upsample=True,
-            block_num_heads=1,
-            num_hidden_layers=1,
-            num_attention_heads=1,
-            attention_dropout=0.0,
-            intermediate_size=8,
-            layer_norm_eps=1e-06,
-            ln_elementwise_affine=True,
-        )
-        scheduler = AmusedScheduler(mask_token_id=31)
-        torch.manual_seed(0)
-        vqvae = VQModel(
-            act_fn="silu",
-            block_out_channels=[8],
-            down_block_types=["DownEncoderBlock2D"],
-            in_channels=3,
-            latent_channels=8,
-            layers_per_block=1,
-            norm_num_groups=8,
-            num_vq_embeddings=32,
-            out_channels=3,
-            sample_size=8,
-            up_block_types=["UpDecoderBlock2D"],
-            mid_block_add_attention=False,
-            lookup_from_codebook=True,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=8,
-            intermediate_size=8,
-            layer_norm_eps=1e-05,
-            num_attention_heads=1,
-            num_hidden_layers=1,
-            pad_token_id=1,
-            vocab_size=1000,
-            projection_dim=8,
-        )
-        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        components = {
-            "transformer": transformer,
-            "scheduler": scheduler,
-            "vqvae": vqvae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "output_type": "np",
-            "image": image,
-        }
-        return inputs
-    def test_inference_batch_consistent(self, batch_sizes=[2]):
-        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
-    @unittest.skip("aMUSEd does not support lists of generators")
-    def test_inference_batch_single_identical(self): ...
-@slow
-@require_torch_accelerator
-class AmusedImg2ImgPipelineSlowTests(unittest.TestCase):
-    def test_amused_256(self):
-        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256")
-        pipe.to(torch_device)
-        image = (
-            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
-            .resize((256, 256))
-            .convert("RGB")
-        )
-        image = pipe(
-            "winter mountains",
-            image,
-            generator=torch.Generator().manual_seed(0),
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.9993, 1.0, 0.9996, 1.0, 0.9995, 0.9925, 0.999, 0.9954, 1.0])
-        assert np.abs(image_slice - expected_slice).max() < 0.01
-    def test_amused_256_fp16(self):
-        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256", torch_dtype=torch.float16, variant="fp16")
-        pipe.to(torch_device)
-        image = (
-            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
-            .resize((256, 256))
-            .convert("RGB")
-        )
-        image = pipe(
-            "winter mountains",
-            image,
-            generator=torch.Generator().manual_seed(0),
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.998, 0.998, 0.994, 0.9944, 0.996, 0.9908, 1.0, 1.0, 0.9986])
-        assert np.abs(image_slice - expected_slice).max() < 0.01
-    def test_amused_512(self):
-        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512")
-        pipe.to(torch_device)
-        image = (
-            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
-            .resize((512, 512))
-            .convert("RGB")
-        )
-        image = pipe(
-            "winter mountains",
-            image,
-            generator=torch.Generator().manual_seed(0),
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.2809, 0.1879, 0.2027, 0.2418, 0.1852, 0.2145, 0.2484, 0.2425, 0.2317])
-        assert np.abs(image_slice - expected_slice).max() < 0.1
-    def test_amused_512_fp16(self):
-        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
-        pipe.to(torch_device)
-        image = (
-            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
-            .resize((512, 512))
-            .convert("RGB")
-        )
-        image = pipe(
-            "winter mountains",
-            image,
-            generator=torch.Generator().manual_seed(0),
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.2795, 0.1867, 0.2028, 0.2450, 0.1856, 0.2140, 0.2473, 0.2406, 0.2313])
-        assert np.abs(image_slice - expected_slice).max() < 0.1
--- a/tests/pipelines/amused/test_amused_inpaint.py
+++ b/tests/pipelines/amused/test_amused_inpaint.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
-from diffusers import AmusedInpaintPipeline, AmusedScheduler, UVit2DModel, VQModel
-from diffusers.utils import load_image
-from diffusers.utils.testing_utils import (
-    Expectations,
-    enable_full_determinism,
-    require_torch_accelerator,
-    slow,
-    torch_device,
-)
-from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class AmusedInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = AmusedInpaintPipeline
-    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"}
-    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
-    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        transformer = UVit2DModel(
-            hidden_size=8,
-            use_bias=False,
-            hidden_dropout=0.0,
-            cond_embed_dim=8,
-            micro_cond_encode_dim=2,
-            micro_cond_embed_dim=10,
-            encoder_hidden_size=8,
-            vocab_size=32,
-            codebook_size=32,
-            in_channels=8,
-            block_out_channels=8,
-            num_res_blocks=1,
-            downsample=True,
-            upsample=True,
-            block_num_heads=1,
-            num_hidden_layers=1,
-            num_attention_heads=1,
-            attention_dropout=0.0,
-            intermediate_size=8,
-            layer_norm_eps=1e-06,
-            ln_elementwise_affine=True,
-        )
-        scheduler = AmusedScheduler(mask_token_id=31)
-        torch.manual_seed(0)
-        vqvae = VQModel(
-            act_fn="silu",
-            block_out_channels=[8],
-            down_block_types=["DownEncoderBlock2D"],
-            in_channels=3,
-            latent_channels=8,
-            layers_per_block=1,
-            norm_num_groups=8,
-            num_vq_embeddings=32,
-            out_channels=3,
-            sample_size=8,
-            up_block_types=["UpDecoderBlock2D"],
-            mid_block_add_attention=False,
-            lookup_from_codebook=True,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=8,
-            intermediate_size=8,
-            layer_norm_eps=1e-05,
-            num_attention_heads=1,
-            num_hidden_layers=1,
-            pad_token_id=1,
-            vocab_size=1000,
-            projection_dim=8,
-        )
-        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        components = {
-            "transformer": transformer,
-            "scheduler": scheduler,
-            "vqvae": vqvae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device)
-        mask_image = torch.full((1, 1, 4, 4), 1.0, dtype=torch.float32, device=device)
-        mask_image[0, 0, 0, 0] = 0
-        mask_image[0, 0, 0, 1] = 0
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "output_type": "np",
-            "image": image,
-            "mask_image": mask_image,
-        }
-        return inputs
-    def test_inference_batch_consistent(self, batch_sizes=[2]):
-        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
-    @unittest.skip("aMUSEd does not support lists of generators")
-    def test_inference_batch_single_identical(self): ...
-@slow
-@require_torch_accelerator
-class AmusedInpaintPipelineSlowTests(unittest.TestCase):
-    def test_amused_256(self):
-        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256")
-        pipe.to(torch_device)
-        image = (
-            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
-            .resize((256, 256))
-            .convert("RGB")
-        )
-        mask_image = (
-            load_image(
-                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
-            )
-            .resize((256, 256))
-            .convert("L")
-        )
-        image = pipe(
-            "winter mountains",
-            image,
-            mask_image,
-            generator=torch.Generator().manual_seed(0),
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.0699, 0.0716, 0.0608, 0.0715, 0.0797, 0.0638, 0.0802, 0.0924, 0.0634])
-        assert np.abs(image_slice - expected_slice).max() < 0.1
-    def test_amused_256_fp16(self):
-        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16)
-        pipe.to(torch_device)
-        image = (
-            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
-            .resize((256, 256))
-            .convert("RGB")
-        )
-        mask_image = (
-            load_image(
-                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
-            )
-            .resize((256, 256))
-            .convert("L")
-        )
-        image = pipe(
-            "winter mountains",
-            image,
-            mask_image,
-            generator=torch.Generator().manual_seed(0),
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.0735, 0.0749, 0.065, 0.0739, 0.0805, 0.0667, 0.0802, 0.0923, 0.0622])
-        assert np.abs(image_slice - expected_slice).max() < 0.1
-    def test_amused_512(self):
-        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512")
-        pipe.to(torch_device)
-        image = (
-            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
-            .resize((512, 512))
-            .convert("RGB")
-        )
-        mask_image = (
-            load_image(
-                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
-            )
-            .resize((512, 512))
-            .convert("L")
-        )
-        image = pipe(
-            "winter mountains",
-            image,
-            mask_image,
-            generator=torch.Generator().manual_seed(0),
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005, 0.0])
-        assert np.abs(image_slice - expected_slice).max() < 0.05
-    def test_amused_512_fp16(self):
-        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
-        pipe.to(torch_device)
-        image = (
-            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
-            .resize((512, 512))
-            .convert("RGB")
-        )
-        mask_image = (
-            load_image(
-                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
-            )
-            .resize((512, 512))
-            .convert("L")
-        )
-        image = pipe(
-            "winter mountains",
-            image,
-            mask_image,
-            generator=torch.Generator().manual_seed(0),
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 512, 512, 3)
-        expected_slices = Expectations(
-            {
-                ("xpu", 3): np.array(
-                    [
-                        0.0274,
-                        0.0211,
-                        0.0154,
-                        0.0257,
-                        0.0299,
-                        0.0170,
-                        0.0326,
-                        0.0420,
-                        0.0150,
-                    ]
-                ),
-                ("cuda", 7): np.array(
-                    [
-                        0.0227,
-                        0.0157,
-                        0.0098,
-                        0.0213,
-                        0.0250,
-                        0.0127,
-                        0.0280,
-                        0.0380,
-                        0.0095,
-                    ]
-                ),
-            }
-        )
-        expected_slice = expected_slices.get_expectation()
-        assert np.abs(image_slice - expected_slice).max() < 0.003
--- a/tests/pipelines/audioldm/__init__.py
+++ b/tests/pipelines/audioldm/__init__.py
--- a/tests/pipelines/audioldm/test_audioldm.py
+++ b/tests/pipelines/audioldm/test_audioldm.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import gc
-import unittest
-import numpy as np
-import torch
-import torch.nn.functional as F
-from transformers import (
-    ClapTextConfig,
-    ClapTextModelWithProjection,
-    RobertaTokenizer,
-    SpeechT5HifiGan,
-    SpeechT5HifiGanConfig,
-)
-from diffusers import (
-    AudioLDMPipeline,
-    AutoencoderKL,
-    DDIMScheduler,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    UNet2DConditionModel,
-)
-from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import backend_empty_cache, enable_full_determinism, nightly, torch_device
-from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = AudioLDMPipeline
-    params = TEXT_TO_AUDIO_PARAMS
-    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "num_waveforms_per_prompt",
-            "generator",
-            "latents",
-            "output_type",
-            "return_dict",
-            "callback",
-            "callback_steps",
-        ]
-    )
-    supports_dduf = False
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(8, 16),
-            layers_per_block=1,
-            norm_num_groups=8,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=(8, 16),
-            class_embed_type="simple_projection",
-            projection_class_embeddings_input_dim=8,
-            class_embeddings_concat=True,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[8, 16],
-            in_channels=1,
-            out_channels=1,
-            norm_num_groups=8,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = ClapTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=8,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=1,
-            num_hidden_layers=1,
-            pad_token_id=1,
-            vocab_size=1000,
-            projection_dim=8,
-        )
-        text_encoder = ClapTextModelWithProjection(text_encoder_config)
-        tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
-        vocoder_config = SpeechT5HifiGanConfig(
-            model_in_dim=8,
-            sampling_rate=16000,
-            upsample_initial_channel=16,
-            upsample_rates=[2, 2],
-            upsample_kernel_sizes=[4, 4],
-            resblock_kernel_sizes=[3, 7],
-            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
-            normalize_before=False,
-        )
-        vocoder = SpeechT5HifiGan(vocoder_config)
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "vocoder": vocoder,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A hammer hitting a wooden surface",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-        }
-        return inputs
-    def test_audioldm_ddim(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        audioldm_pipe = AudioLDMPipeline(**components)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(device)
-        output = audioldm_pipe(**inputs)
-        audio = output.audios[0]
-        assert audio.ndim == 1
-        assert len(audio) == 256
-        audio_slice = audio[:10]
-        expected_slice = np.array(
-            [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033]
-        )
-        assert np.abs(audio_slice - expected_slice).max() < 1e-2
-    def test_audioldm_prompt_embeds(self):
-        components = self.get_dummy_components()
-        audioldm_pipe = AudioLDMPipeline(**components)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["prompt"] = 3 * [inputs["prompt"]]
-        # forward
-        output = audioldm_pipe(**inputs)
-        audio_1 = output.audios[0]
-        inputs = self.get_dummy_inputs(torch_device)
-        prompt = 3 * [inputs.pop("prompt")]
-        text_inputs = audioldm_pipe.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=audioldm_pipe.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_inputs = text_inputs["input_ids"].to(torch_device)
-        prompt_embeds = audioldm_pipe.text_encoder(
-            text_inputs,
-        )
-        prompt_embeds = prompt_embeds.text_embeds
-        # additional L_2 normalization over each hidden-state
-        prompt_embeds = F.normalize(prompt_embeds, dim=-1)
-        inputs["prompt_embeds"] = prompt_embeds
-        # forward
-        output = audioldm_pipe(**inputs)
-        audio_2 = output.audios[0]
-        assert np.abs(audio_1 - audio_2).max() < 1e-2
-    def test_audioldm_negative_prompt_embeds(self):
-        components = self.get_dummy_components()
-        audioldm_pipe = AudioLDMPipeline(**components)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        negative_prompt = 3 * ["this is a negative prompt"]
-        inputs["negative_prompt"] = negative_prompt
-        inputs["prompt"] = 3 * [inputs["prompt"]]
-        # forward
-        output = audioldm_pipe(**inputs)
-        audio_1 = output.audios[0]
-        inputs = self.get_dummy_inputs(torch_device)
-        prompt = 3 * [inputs.pop("prompt")]
-        embeds = []
-        for p in [prompt, negative_prompt]:
-            text_inputs = audioldm_pipe.tokenizer(
-                p,
-                padding="max_length",
-                max_length=audioldm_pipe.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_inputs = text_inputs["input_ids"].to(torch_device)
-            text_embeds = audioldm_pipe.text_encoder(
-                text_inputs,
-            )
-            text_embeds = text_embeds.text_embeds
-            # additional L_2 normalization over each hidden-state
-            text_embeds = F.normalize(text_embeds, dim=-1)
-            embeds.append(text_embeds)
-        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
-        # forward
-        output = audioldm_pipe(**inputs)
-        audio_2 = output.audios[0]
-        assert np.abs(audio_1 - audio_2).max() < 1e-2
-    def test_audioldm_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
-        audioldm_pipe = AudioLDMPipeline(**components)
-        audioldm_pipe = audioldm_pipe.to(device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(device)
-        negative_prompt = "egg cracking"
-        output = audioldm_pipe(**inputs, negative_prompt=negative_prompt)
-        audio = output.audios[0]
-        assert audio.ndim == 1
-        assert len(audio) == 256
-        audio_slice = audio[:10]
-        expected_slice = np.array(
-            [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]
-        )
-        assert np.abs(audio_slice - expected_slice).max() < 1e-2
-    def test_audioldm_num_waveforms_per_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
-        audioldm_pipe = AudioLDMPipeline(**components)
-        audioldm_pipe = audioldm_pipe.to(device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        prompt = "A hammer hitting a wooden surface"
-        # test num_waveforms_per_prompt=1 (default)
-        audios = audioldm_pipe(prompt, num_inference_steps=2).audios
-        assert audios.shape == (1, 256)
-        # test num_waveforms_per_prompt=1 (default) for batch of prompts
-        batch_size = 2
-        audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
-        assert audios.shape == (batch_size, 256)
-        # test num_waveforms_per_prompt for single prompt
-        num_waveforms_per_prompt = 2
-        audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios
-        assert audios.shape == (num_waveforms_per_prompt, 256)
-        # test num_waveforms_per_prompt for batch of prompts
-        batch_size = 2
-        audios = audioldm_pipe(
-            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
-        ).audios
-        assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
-    def test_audioldm_audio_length_in_s(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        audioldm_pipe = AudioLDMPipeline(**components)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate
-        inputs = self.get_dummy_inputs(device)
-        output = audioldm_pipe(audio_length_in_s=0.016, **inputs)
-        audio = output.audios[0]
-        assert audio.ndim == 1
-        assert len(audio) / vocoder_sampling_rate == 0.016
-        output = audioldm_pipe(audio_length_in_s=0.032, **inputs)
-        audio = output.audios[0]
-        assert audio.ndim == 1
-        assert len(audio) / vocoder_sampling_rate == 0.032
-    def test_audioldm_vocoder_model_in_dim(self):
-        components = self.get_dummy_components()
-        audioldm_pipe = AudioLDMPipeline(**components)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        prompt = ["hey"]
-        output = audioldm_pipe(prompt, num_inference_steps=1)
-        audio_shape = output.audios.shape
-        assert audio_shape == (1, 256)
-        config = audioldm_pipe.vocoder.config
-        config.model_in_dim *= 2
-        audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
-        output = audioldm_pipe(prompt, num_inference_steps=1)
-        audio_shape = output.audios.shape
-        # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
-        assert audio_shape == (1, 256)
-    def test_attention_slicing_forward_pass(self):
-        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical()
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
-@nightly
-class AudioLDMPipelineSlowTests(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
-        generator = torch.Generator(device=generator_device).manual_seed(seed)
-        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
-        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
-        inputs = {
-            "prompt": "A hammer hitting a wooden surface",
-            "latents": latents,
-            "generator": generator,
-            "num_inference_steps": 3,
-            "guidance_scale": 2.5,
-        }
-        return inputs
-    def test_audioldm(self):
-        audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 25
-        audio = audioldm_pipe(**inputs).audios[0]
-        assert audio.ndim == 1
-        assert len(audio) == 81920
-        audio_slice = audio[77230:77240]
-        expected_slice = np.array(
-            [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315]
-        )
-        max_diff = np.abs(expected_slice - audio_slice).max()
-        assert max_diff < 1e-2
-@nightly
-class AudioLDMPipelineNightlyTests(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
-        generator = torch.Generator(device=generator_device).manual_seed(seed)
-        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
-        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
-        inputs = {
-            "prompt": "A hammer hitting a wooden surface",
-            "latents": latents,
-            "generator": generator,
-            "num_inference_steps": 3,
-            "guidance_scale": 2.5,
-        }
-        return inputs
-    def test_audioldm_lms(self):
-        audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
-        audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_inputs(torch_device)
-        audio = audioldm_pipe(**inputs).audios[0]
-        assert audio.ndim == 1
-        assert len(audio) == 81920
-        audio_slice = audio[27780:27790]
-        expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212])
-        max_diff = np.abs(expected_slice - audio_slice).max()
-        assert max_diff < 3e-2
--- a/tests/pipelines/blipdiffusion/__init__.py
+++ b/tests/pipelines/blipdiffusion/__init__.py
--- a/tests/pipelines/blipdiffusion/test_blipdiffusion.py
+++ b/tests/pipelines/blipdiffusion/test_blipdiffusion.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import numpy as np
-import torch
-from PIL import Image
-from transformers import CLIPTokenizer
-from transformers.models.blip_2.configuration_blip_2 import Blip2Config
-from transformers.models.clip.configuration_clip import CLIPTextConfig
-from diffusers import AutoencoderKL, BlipDiffusionPipeline, PNDMScheduler, UNet2DConditionModel
-from diffusers.utils.testing_utils import enable_full_determinism
-from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
-from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
-from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = BlipDiffusionPipeline
-    params = [
-        "prompt",
-        "reference_image",
-        "source_subject_category",
-        "target_subject_category",
-    ]
-    batch_params = [
-        "prompt",
-        "reference_image",
-        "source_subject_category",
-        "target_subject_category",
-    ]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "num_inference_steps",
-        "neg_prompt",
-        "guidance_scale",
-        "prompt_strength",
-        "prompt_reps",
-    ]
-    supports_dduf = False
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            vocab_size=1000,
-            hidden_size=8,
-            intermediate_size=8,
-            projection_dim=8,
-            num_hidden_layers=1,
-            num_attention_heads=1,
-            max_position_embeddings=77,
-        )
-        text_encoder = ContextCLIPTextModel(text_encoder_config)
-        vae = AutoencoderKL(
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownEncoderBlock2D",),
-            up_block_types=("UpDecoderBlock2D",),
-            block_out_channels=(8,),
-            norm_num_groups=8,
-            layers_per_block=1,
-            act_fn="silu",
-            latent_channels=4,
-            sample_size=8,
-        )
-        blip_vision_config = {
-            "hidden_size": 8,
-            "intermediate_size": 8,
-            "num_hidden_layers": 1,
-            "num_attention_heads": 1,
-            "image_size": 224,
-            "patch_size": 14,
-            "hidden_act": "quick_gelu",
-        }
-        blip_qformer_config = {
-            "vocab_size": 1000,
-            "hidden_size": 8,
-            "num_hidden_layers": 1,
-            "num_attention_heads": 1,
-            "intermediate_size": 8,
-            "max_position_embeddings": 512,
-            "cross_attention_frequency": 1,
-            "encoder_hidden_size": 8,
-        }
-        qformer_config = Blip2Config(
-            vision_config=blip_vision_config,
-            qformer_config=blip_qformer_config,
-            num_query_tokens=8,
-            tokenizer="hf-internal-testing/tiny-random-bert",
-        )
-        qformer = Blip2QFormerModel(qformer_config)
-        unet = UNet2DConditionModel(
-            block_out_channels=(8, 16),
-            norm_num_groups=8,
-            layers_per_block=1,
-            sample_size=16,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=8,
-        )
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        scheduler = PNDMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            set_alpha_to_one=False,
-            skip_prk_steps=True,
-        )
-        vae.eval()
-        qformer.eval()
-        text_encoder.eval()
-        image_processor = BlipImageProcessor()
-        components = {
-            "text_encoder": text_encoder,
-            "vae": vae,
-            "qformer": qformer,
-            "unet": unet,
-            "tokenizer": tokenizer,
-            "scheduler": scheduler,
-            "image_processor": image_processor,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        np.random.seed(seed)
-        reference_image = np.random.rand(32, 32, 3) * 255
-        reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA")
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "swimming underwater",
-            "generator": generator,
-            "reference_image": reference_image,
-            "source_subject_category": "dog",
-            "target_subject_category": "dog",
-            "height": 32,
-            "width": 32,
-            "guidance_scale": 7.5,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-        return inputs
-    def test_blipdiffusion(self):
-        device = "cpu"
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-        image = pipe(**self.get_dummy_inputs(device))[0]
-        image_slice = image[0, -3:, -3:, 0]
-        assert image.shape == (1, 16, 16, 4)
-        expected_slice = np.array(
-            [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007]
-        )
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
-            f" expected_slice {image_slice.flatten()}, but got {image_slice.flatten()}"
-        )
-    @unittest.skip("Test not supported because of complexities in deriving query_embeds.")
-    def test_encode_prompt_works_in_isolation(self):
-        pass
--- a/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py
+++ b/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import numpy as np
-import torch
-from PIL import Image
-from transformers import CLIPTokenizer
-from transformers.models.blip_2.configuration_blip_2 import Blip2Config
-from transformers.models.clip.configuration_clip import CLIPTextConfig
-from diffusers import (
-    AutoencoderKL,
-    BlipDiffusionControlNetPipeline,
-    ControlNetModel,
-    PNDMScheduler,
-    UNet2DConditionModel,
-)
-from diffusers.utils.testing_utils import enable_full_determinism, torch_device
-from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
-from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
-from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class BlipDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = BlipDiffusionControlNetPipeline
-    params = [
-        "prompt",
-        "reference_image",
-        "source_subject_category",
-        "target_subject_category",
-        "condtioning_image",
-    ]
-    batch_params = [
-        "prompt",
-        "reference_image",
-        "source_subject_category",
-        "target_subject_category",
-        "condtioning_image",
-    ]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "num_inference_steps",
-        "neg_prompt",
-        "guidance_scale",
-        "prompt_strength",
-        "prompt_reps",
-    ]
-    supports_dduf = False
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            vocab_size=1000,
-            hidden_size=16,
-            intermediate_size=16,
-            projection_dim=16,
-            num_hidden_layers=1,
-            num_attention_heads=1,
-            max_position_embeddings=77,
-        )
-        text_encoder = ContextCLIPTextModel(text_encoder_config)
-        vae = AutoencoderKL(
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownEncoderBlock2D",),
-            up_block_types=("UpDecoderBlock2D",),
-            block_out_channels=(32,),
-            layers_per_block=1,
-            act_fn="silu",
-            latent_channels=4,
-            norm_num_groups=16,
-            sample_size=16,
-        )
-        blip_vision_config = {
-            "hidden_size": 16,
-            "intermediate_size": 16,
-            "num_hidden_layers": 1,
-            "num_attention_heads": 1,
-            "image_size": 224,
-            "patch_size": 14,
-            "hidden_act": "quick_gelu",
-        }
-        blip_qformer_config = {
-            "vocab_size": 1000,
-            "hidden_size": 16,
-            "num_hidden_layers": 1,
-            "num_attention_heads": 1,
-            "intermediate_size": 16,
-            "max_position_embeddings": 512,
-            "cross_attention_frequency": 1,
-            "encoder_hidden_size": 16,
-        }
-        qformer_config = Blip2Config(
-            vision_config=blip_vision_config,
-            qformer_config=blip_qformer_config,
-            num_query_tokens=16,
-            tokenizer="hf-internal-testing/tiny-random-bert",
-        )
-        qformer = Blip2QFormerModel(qformer_config)
-        unet = UNet2DConditionModel(
-            block_out_channels=(4, 16),
-            layers_per_block=1,
-            norm_num_groups=4,
-            sample_size=16,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=16,
-        )
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        scheduler = PNDMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            set_alpha_to_one=False,
-            skip_prk_steps=True,
-        )
-        controlnet = ControlNetModel(
-            block_out_channels=(4, 16),
-            layers_per_block=1,
-            in_channels=4,
-            norm_num_groups=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            cross_attention_dim=16,
-            conditioning_embedding_out_channels=(8, 16),
-        )
-        vae.eval()
-        qformer.eval()
-        text_encoder.eval()
-        image_processor = BlipImageProcessor()
-        components = {
-            "text_encoder": text_encoder,
-            "vae": vae,
-            "qformer": qformer,
-            "unet": unet,
-            "tokenizer": tokenizer,
-            "scheduler": scheduler,
-            "controlnet": controlnet,
-            "image_processor": image_processor,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        np.random.seed(seed)
-        reference_image = np.random.rand(32, 32, 3) * 255
-        reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA")
-        cond_image = np.random.rand(32, 32, 3) * 255
-        cond_image = Image.fromarray(cond_image.astype("uint8")).convert("RGBA")
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "swimming underwater",
-            "generator": generator,
-            "reference_image": reference_image,
-            "condtioning_image": cond_image,
-            "source_subject_category": "dog",
-            "target_subject_category": "dog",
-            "height": 32,
-            "width": 32,
-            "guidance_scale": 7.5,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-        return inputs
-    def test_dict_tuple_outputs_equivalent(self):
-        expected_slice = None
-        if torch_device == "cpu":
-            expected_slice = np.array([0.4803, 0.3865, 0.1422, 0.6119, 0.2283, 0.6365, 0.5453, 0.5205, 0.3581])
-        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
-    def test_blipdiffusion_controlnet(self):
-        device = "cpu"
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-        image = pipe(**self.get_dummy_inputs(device))[0]
-        image_slice = image[0, -3:, -3:, 0]
-        assert image.shape == (1, 16, 16, 4)
-        expected_slice = np.array([0.7953, 0.7136, 0.6597, 0.4779, 0.7389, 0.4111, 0.5826, 0.4150, 0.8422])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
-            f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
-        )
-    @unittest.skip("Test not supported because of complexities in deriving query_embeds.")
-    def test_encode_prompt_works_in_isolation(self):
-        pass
--- a/tests/pipelines/controlnet_xs/__init__.py
+++ b/tests/pipelines/controlnet_xs/__init__.py
--- a/tests/pipelines/controlnet_xs/test_controlnetxs.py
+++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import gc
-import unittest
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from diffusers import (
-    AsymmetricAutoencoderKL,
-    AutoencoderKL,
-    AutoencoderTiny,
-    ConsistencyDecoderVAE,
-    ControlNetXSAdapter,
-    DDIMScheduler,
-    LCMScheduler,
-    StableDiffusionControlNetXSPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.utils.import_utils import is_xformers_available
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    load_image,
-    require_accelerator,
-    require_torch_accelerator,
-    slow,
-    torch_device,
-)
-from diffusers.utils.torch_utils import randn_tensor
-from ...models.autoencoders.vae import (
-    get_asym_autoencoder_kl_config,
-    get_autoencoder_kl_config,
-    get_autoencoder_tiny_config,
-    get_consistency_vae_config,
-)
-from ..pipeline_params import (
-    IMAGE_TO_IMAGE_IMAGE_PARAMS,
-    TEXT_TO_IMAGE_BATCH_PARAMS,
-    TEXT_TO_IMAGE_IMAGE_PARAMS,
-    TEXT_TO_IMAGE_PARAMS,
-)
-from ..test_pipelines_common import (
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-    SDFunctionTesterMixin,
-)
-enable_full_determinism()
-def to_np(tensor):
-    if isinstance(tensor, torch.Tensor):
-        tensor = tensor.detach().cpu().numpy()
-    return tensor
-class ControlNetXSPipelineFastTests(
-    PipelineLatentTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineTesterMixin,
-    SDFunctionTesterMixin,
-    unittest.TestCase,
-):
-    pipeline_class = StableDiffusionControlNetXSPipeline
-    params = TEXT_TO_IMAGE_PARAMS
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
-    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    test_attention_slicing = False
-    test_layerwise_casting = True
-    test_group_offloading = True
-    def get_dummy_components(self, time_cond_proj_dim=None):
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(4, 8),
-            layers_per_block=2,
-            sample_size=16,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=8,
-            norm_num_groups=4,
-            time_cond_proj_dim=time_cond_proj_dim,
-            use_linear_projection=True,
-        )
-        torch.manual_seed(0)
-        controlnet = ControlNetXSAdapter.from_unet(
-            unet=unet,
-            size_ratio=1,
-            learn_time_embedding=True,
-            conditioning_embedding_out_channels=(2, 2),
-        )
-        torch.manual_seed(0)
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[4, 8],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-            norm_num_groups=2,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=8,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        components = {
-            "unet": unet,
-            "controlnet": controlnet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "safety_checker": None,
-            "feature_extractor": None,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        controlnet_embedder_scale_factor = 2
-        image = randn_tensor(
-            (1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor),
-            generator=generator,
-            device=torch.device(device),
-        )
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "numpy",
-            "image": image,
-        }
-        return inputs
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
-    def test_controlnet_lcm(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components(time_cond_proj_dim=8)
-        sd_pipe = StableDiffusionControlNetXSPipeline(**components)
-        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(device)
-        output = sd_pipe(**inputs)
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-        assert image.shape == (1, 16, 16, 3)
-        expected_slice = np.array([0.745, 0.753, 0.767, 0.543, 0.523, 0.502, 0.314, 0.521, 0.478])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-    def test_to_dtype(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components
-        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
-        pipe.to(dtype=torch.float16)
-        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
-        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
-    def test_multi_vae(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        block_out_channels = pipe.vae.config.block_out_channels
-        norm_num_groups = pipe.vae.config.norm_num_groups
-        vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
-        configs = [
-            get_autoencoder_kl_config(block_out_channels, norm_num_groups),
-            get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
-            get_consistency_vae_config(block_out_channels, norm_num_groups),
-            get_autoencoder_tiny_config(block_out_channels),
-        ]
-        out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
-        for vae_cls, config in zip(vae_classes, configs):
-            vae = vae_cls(**config)
-            vae = vae.to(torch_device)
-            components["vae"] = vae
-            vae_pipe = self.pipeline_class(**components)
-            # pipeline creates a new UNetControlNetXSModel under the hood, which aren't on device.
-            # So we need to move the new pipe to device.
-            vae_pipe.to(torch_device)
-            vae_pipe.set_progress_bar_config(disable=None)
-            out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
-            assert out_vae_np.shape == out_np.shape
-    @require_accelerator
-    def test_to_device(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to("cpu")
-        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the device from pipe.components
-        model_devices = [
-            component.device.type for component in pipe.components.values() if hasattr(component, "device")
-        ]
-        self.assertTrue(all(device == "cpu" for device in model_devices))
-        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
-        self.assertTrue(np.isnan(output_cpu).sum() == 0)
-        pipe.to(torch_device)
-        model_devices = [
-            component.device.type for component in pipe.components.values() if hasattr(component, "device")
-        ]
-        self.assertTrue(all(device == torch_device for device in model_devices))
-        output_device = pipe(**self.get_dummy_inputs(torch_device))[0]
-        self.assertTrue(np.isnan(to_np(output_device)).sum() == 0)
-    def test_encode_prompt_works_in_isolation(self):
-        extra_required_param_value_dict = {
-            "device": torch.device(torch_device).type,
-            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
-        }
-        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
-@slow
-@require_torch_accelerator
-class ControlNetXSPipelineSlowTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def test_canny(self):
-        controlnet = ControlNetXSAdapter.from_pretrained(
-            "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
-        )
-        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
-        )
-        pipe.enable_model_cpu_offload(device=torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        prompt = "bird"
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
-        )
-        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
-        image = output.images[0]
-        assert image.shape == (768, 512, 3)
-        original_image = image[-3:, -3:, -1].flatten()
-        expected_image = np.array([0.1963, 0.229, 0.2659, 0.2109, 0.2332, 0.2827, 0.2534, 0.2422, 0.2808])
-        assert np.allclose(original_image, expected_image, atol=1e-04)
-    def test_depth(self):
-        controlnet = ControlNetXSAdapter.from_pretrained(
-            "UmerHA/Testing-ConrolNetXS-SD2.1-depth", torch_dtype=torch.float16
-        )
-        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
-        )
-        pipe.enable_model_cpu_offload(device=torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        prompt = "Stormtrooper's lecture"
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
-        )
-        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
-        image = output.images[0]
-        assert image.shape == (512, 512, 3)
-        original_image = image[-3:, -3:, -1].flatten()
-        expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
-        assert np.allclose(original_image, expected_image, atol=1e-04)
--- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py
+++ b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import gc
-import unittest
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
-from diffusers import (
-    AsymmetricAutoencoderKL,
-    AutoencoderKL,
-    AutoencoderTiny,
-    ConsistencyDecoderVAE,
-    ControlNetXSAdapter,
-    EulerDiscreteScheduler,
-    StableDiffusionXLControlNetXSPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.utils.import_utils import is_xformers_available
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    load_image,
-    require_torch_accelerator,
-    slow,
-    torch_device,
-)
-from diffusers.utils.torch_utils import randn_tensor
-from ...models.autoencoders.vae import (
-    get_asym_autoencoder_kl_config,
-    get_autoencoder_kl_config,
-    get_autoencoder_tiny_config,
-    get_consistency_vae_config,
-)
-from ..pipeline_params import (
-    IMAGE_TO_IMAGE_IMAGE_PARAMS,
-    TEXT_TO_IMAGE_BATCH_PARAMS,
-    TEXT_TO_IMAGE_IMAGE_PARAMS,
-    TEXT_TO_IMAGE_PARAMS,
-)
-from ..test_pipelines_common import (
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-)
-enable_full_determinism()
-class StableDiffusionXLControlNetXSPipelineFastTests(
-    PipelineLatentTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineTesterMixin,
-    unittest.TestCase,
-):
-    pipeline_class = StableDiffusionXLControlNetXSPipeline
-    params = TEXT_TO_IMAGE_PARAMS
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
-    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    test_attention_slicing = False
-    test_layerwise_casting = True
-    test_group_offloading = True
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(4, 8),
-            layers_per_block=2,
-            sample_size=16,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            use_linear_projection=True,
-            norm_num_groups=4,
-            # SD2-specific config below
-            attention_head_dim=(2, 4),
-            addition_embed_type="text_time",
-            addition_time_embed_dim=8,
-            transformer_layers_per_block=(1, 2),
-            projection_class_embeddings_input_dim=56,  # 6 * 8 (addition_time_embed_dim) + 8 (cross_attention_dim)
-            cross_attention_dim=8,
-        )
-        torch.manual_seed(0)
-        controlnet = ControlNetXSAdapter.from_unet(
-            unet=unet,
-            size_ratio=0.5,
-            learn_time_embedding=True,
-            conditioning_embedding_out_channels=(2, 2),
-        )
-        torch.manual_seed(0)
-        scheduler = EulerDiscreteScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            steps_offset=1,
-            beta_schedule="scaled_linear",
-            timestep_spacing="leading",
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[4, 8],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-            norm_num_groups=2,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=4,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-            # SD2-specific config below
-            hidden_act="gelu",
-            projection_dim=8,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
-        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        components = {
-            "unet": unet,
-            "controlnet": controlnet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "text_encoder_2": text_encoder_2,
-            "tokenizer_2": tokenizer_2,
-            "feature_extractor": None,
-        }
-        return components
-    # Copied from test_controlnet_sdxl.py
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        controlnet_embedder_scale_factor = 2
-        image = randn_tensor(
-            (1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor),
-            generator=generator,
-            device=torch.device(device),
-        )
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "np",
-            "image": image,
-        }
-        return inputs
-    # Copied from test_controlnet_sdxl.py
-    def test_attention_slicing_forward_pass(self):
-        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    # Copied from test_controlnet_sdxl.py
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
-    # Copied from test_controlnet_sdxl.py
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
-    @unittest.skip("We test this functionality elsewhere already.")
-    def test_save_load_optional_components(self):
-        pass
-    @require_torch_accelerator
-    # Copied from test_controlnet_sdxl.py
-    def test_stable_diffusion_xl_offloads(self):
-        pipes = []
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components).to(torch_device)
-        pipes.append(sd_pipe)
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload(device=torch_device)
-        pipes.append(sd_pipe)
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
-        pipes.append(sd_pipe)
-        image_slices = []
-        for pipe in pipes:
-            pipe.unet.set_default_attn_processor()
-            inputs = self.get_dummy_inputs(torch_device)
-            image = pipe(**inputs).images
-            image_slices.append(image[0, -3:, -3:, -1].flatten())
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
-        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
-    # Copied from test_controlnet_sdxl.py
-    def test_stable_diffusion_xl_multi_prompts(self):
-        components = self.get_dummy_components()
-        sd_pipe = self.pipeline_class(**components).to(torch_device)
-        # forward with single prompt
-        inputs = self.get_dummy_inputs(torch_device)
-        output = sd_pipe(**inputs)
-        image_slice_1 = output.images[0, -3:, -3:, -1]
-        # forward with same prompt duplicated
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["prompt_2"] = inputs["prompt"]
-        output = sd_pipe(**inputs)
-        image_slice_2 = output.images[0, -3:, -3:, -1]
-        # ensure the results are equal
-        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
-        # forward with different prompt
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["prompt_2"] = "different prompt"
-        output = sd_pipe(**inputs)
-        image_slice_3 = output.images[0, -3:, -3:, -1]
-        # ensure the results are not equal
-        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
-        # manually set a negative_prompt
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["negative_prompt"] = "negative prompt"
-        output = sd_pipe(**inputs)
-        image_slice_1 = output.images[0, -3:, -3:, -1]
-        # forward with same negative_prompt duplicated
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["negative_prompt"] = "negative prompt"
-        inputs["negative_prompt_2"] = inputs["negative_prompt"]
-        output = sd_pipe(**inputs)
-        image_slice_2 = output.images[0, -3:, -3:, -1]
-        # ensure the results are equal
-        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
-        # forward with different negative_prompt
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["negative_prompt"] = "negative prompt"
-        inputs["negative_prompt_2"] = "different negative prompt"
-        output = sd_pipe(**inputs)
-        image_slice_3 = output.images[0, -3:, -3:, -1]
-        # ensure the results are not equal
-        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
-    # Copied from test_controlnetxs.py
-    def test_to_dtype(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components
-        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
-        pipe.to(dtype=torch.float16)
-        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
-        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
-    def test_multi_vae(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        block_out_channels = pipe.vae.config.block_out_channels
-        norm_num_groups = pipe.vae.config.norm_num_groups
-        vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
-        configs = [
-            get_autoencoder_kl_config(block_out_channels, norm_num_groups),
-            get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
-            get_consistency_vae_config(block_out_channels, norm_num_groups),
-            get_autoencoder_tiny_config(block_out_channels),
-        ]
-        out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
-        for vae_cls, config in zip(vae_classes, configs):
-            vae = vae_cls(**config)
-            vae = vae.to(torch_device)
-            components["vae"] = vae
-            vae_pipe = self.pipeline_class(**components)
-            # pipeline creates a new UNetControlNetXSModel under the hood, which aren't on device.
-            # So we need to move the new pipe to device.
-            vae_pipe.to(torch_device)
-            vae_pipe.set_progress_bar_config(disable=None)
-            out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
-            assert out_vae_np.shape == out_np.shape
-@slow
-@require_torch_accelerator
-class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def test_canny(self):
-        controlnet = ControlNetXSAdapter.from_pretrained(
-            "UmerHA/Testing-ConrolNetXS-SDXL-canny", torch_dtype=torch.float16
-        )
-        pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
-        )
-        pipe.enable_sequential_cpu_offload(device=torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        prompt = "bird"
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
-        )
-        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
-        assert images[0].shape == (768, 512, 3)
-        original_image = images[0, -3:, -3:, -1].flatten()
-        expected_image = np.array([0.3202, 0.3151, 0.3328, 0.3172, 0.337, 0.3381, 0.3378, 0.3389, 0.3224])
-        assert np.allclose(original_image, expected_image, atol=1e-04)
-    def test_depth(self):
-        controlnet = ControlNetXSAdapter.from_pretrained(
-            "UmerHA/Testing-ConrolNetXS-SDXL-depth", torch_dtype=torch.float16
-        )
-        pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
-        )
-        pipe.enable_sequential_cpu_offload(device=torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        prompt = "Stormtrooper's lecture"
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
-        )
-        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
-        assert images[0].shape == (512, 512, 3)
-        original_image = images[0, -3:, -3:, -1].flatten()
-        expected_image = np.array([0.5448, 0.5437, 0.5426, 0.5543, 0.553, 0.5475, 0.5595, 0.5602, 0.5529])
-        assert np.allclose(original_image, expected_image, atol=1e-04)
--- a/tests/pipelines/dance_diffusion/__init__.py
+++ b/tests/pipelines/dance_diffusion/__init__.py
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import gc
-import unittest
-import numpy as np
-import torch
-from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    nightly,
-    require_torch_accelerator,
-    skip_mps,
-    torch_device,
-)
-from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = DanceDiffusionPipeline
-    params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS
-    required_optional_params = PipelineTesterMixin.required_optional_params - {
-        "callback",
-        "latents",
-        "callback_steps",
-        "output_type",
-        "num_images_per_prompt",
-    }
-    batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS
-    test_attention_slicing = False
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet1DModel(
-            block_out_channels=(32, 32, 64),
-            extra_in_channels=16,
-            sample_size=512,
-            sample_rate=16_000,
-            in_channels=2,
-            out_channels=2,
-            flip_sin_to_cos=True,
-            use_timestep_embedding=False,
-            time_embedding_type="fourier",
-            mid_block_type="UNetMidBlock1D",
-            down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
-            up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
-        )
-        scheduler = IPNDMScheduler()
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "batch_size": 1,
-            "generator": generator,
-            "num_inference_steps": 4,
-        }
-        return inputs
-    def test_dance_diffusion(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        pipe = DanceDiffusionPipeline(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(device)
-        output = pipe(**inputs)
-        audio = output.audios
-        audio_slice = audio[0, -3:, -3:]
-        assert audio.shape == (1, 2, components["unet"].sample_size)
-        expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000])
-        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
-    @skip_mps
-    def test_save_load_local(self):
-        return super().test_save_load_local()
-    @skip_mps
-    def test_dict_tuple_outputs_equivalent(self):
-        return super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
-    @skip_mps
-    def test_save_load_optional_components(self):
-        return super().test_save_load_optional_components()
-    @skip_mps
-    def test_attention_slicing_forward_pass(self):
-        return super().test_attention_slicing_forward_pass()
-    def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
-@nightly
-@require_torch_accelerator
-class PipelineIntegrationTests(unittest.TestCase):
-    def setUp(self):
-        # clean up the VRAM before each test
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def test_dance_diffusion(self):
-        device = torch_device
-        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-        generator = torch.manual_seed(0)
-        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
-        audio = output.audios
-        audio_slice = audio[0, -3:, -3:]
-        assert audio.shape == (1, 2, pipe.unet.config.sample_size)
-        expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020])
-        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
-    def test_dance_diffusion_fp16(self):
-        device = torch_device
-        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-        generator = torch.manual_seed(0)
-        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
-        audio = output.audios
-        audio_slice = audio[0, -3:, -3:]
-        assert audio.shape == (1, 2, pipe.unet.config.sample_size)
-        expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341])
-        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
--- a/tests/pipelines/i2vgen_xl/__init__.py
+++ b/tests/pipelines/i2vgen_xl/__init__.py
--- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
+++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import gc
-import random
-import unittest
-import numpy as np
-import pytest
-import torch
-from transformers import (
-    CLIPImageProcessor,
-    CLIPTextConfig,
-    CLIPTextModel,
-    CLIPTokenizer,
-    CLIPVisionConfig,
-    CLIPVisionModelWithProjection,
-)
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    I2VGenXLPipeline,
-)
-from diffusers.models.unets import I2VGenXLUNet
-from diffusers.utils import is_xformers_available, load_image
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    floats_tensor,
-    is_torch_version,
-    numpy_cosine_similarity_distance,
-    require_torch_accelerator,
-    skip_mps,
-    slow,
-    torch_device,
-)
-from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin
-enable_full_determinism()
-@skip_mps
-class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = I2VGenXLPipeline
-    params = frozenset(["prompt", "negative_prompt", "image"])
-    batch_params = frozenset(["prompt", "negative_prompt", "image", "generator"])
-    # No `output_type`.
-    required_optional_params = frozenset(["num_inference_steps", "generator", "latents", "return_dict"])
-    supports_dduf = False
-    test_layerwise_casting = True
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-        torch.manual_seed(0)
-        unet = I2VGenXLUNet(
-            block_out_channels=(4, 8),
-            layers_per_block=1,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
-            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
-            cross_attention_dim=4,
-            attention_head_dim=4,
-            num_attention_heads=None,
-            norm_num_groups=2,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=(8,),
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D"],
-            latent_channels=4,
-            sample_size=32,
-            norm_num_groups=2,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=4,
-            intermediate_size=16,
-            layer_norm_eps=1e-05,
-            num_attention_heads=2,
-            num_hidden_layers=2,
-            pad_token_id=1,
-            vocab_size=1000,
-            hidden_act="gelu",
-            projection_dim=32,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        torch.manual_seed(0)
-        vision_encoder_config = CLIPVisionConfig(
-            hidden_size=4,
-            projection_dim=4,
-            num_hidden_layers=2,
-            num_attention_heads=2,
-            image_size=32,
-            intermediate_size=16,
-            patch_size=1,
-        )
-        image_encoder = CLIPVisionModelWithProjection(vision_encoder_config)
-        torch.manual_seed(0)
-        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "image_encoder": image_encoder,
-            "tokenizer": tokenizer,
-            "feature_extractor": feature_extractor,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "image": input_image,
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "pt",
-            "num_frames": 4,
-            "width": 32,
-            "height": 32,
-        }
-        return inputs
-    def test_text_to_video_default_case(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(device)
-        inputs["output_type"] = "np"
-        frames = pipe(**inputs).frames
-        image_slice = frames[0][0][-3:, -3:, -1]
-        assert frames[0][0].shape == (32, 32, 3)
-        expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-    @pytest.mark.xfail(
-        condition=is_torch_version(">=", "2.7"),
-        reason="Test currently fails on PyTorch 2.7.",
-        strict=False,
-    )
-    def test_save_load_local(self):
-        super().test_save_load_local(expected_max_difference=0.006)
-    def test_sequential_cpu_offload_forward_pass(self):
-        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=0.008)
-    def test_dict_tuple_outputs_equivalent(self):
-        super().test_dict_tuple_outputs_equivalent(expected_max_difference=0.009)
-    def test_save_load_optional_components(self):
-        super().test_save_load_optional_components(expected_max_difference=0.008)
-    @unittest.skip("Deprecated functionality")
-    def test_attention_slicing_forward_pass(self):
-        pass
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
-    def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=0.008)
-    def test_model_cpu_offload_forward_pass(self):
-        super().test_model_cpu_offload_forward_pass(expected_max_diff=0.008)
-    def test_num_videos_per_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(device)
-        inputs["output_type"] = "np"
-        frames = pipe(**inputs, num_videos_per_prompt=2).frames
-        assert frames.shape == (2, 4, 32, 32, 3)
-        assert frames[0][0].shape == (32, 32, 3)
-        image_slice = frames[0][0][-3:, -3:, -1]
-        expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-    @unittest.skip("Test not supported for now.")
-    def test_encode_prompt_works_in_isolation(self):
-        pass
-@slow
-@require_torch_accelerator
-class I2VGenXLPipelineSlowTests(unittest.TestCase):
-    def setUp(self):
-        # clean up the VRAM before each test
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def test_i2vgen_xl(self):
-        pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
-        pipe.enable_model_cpu_offload(device=torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
-        )
-        generator = torch.Generator("cpu").manual_seed(0)
-        num_frames = 3
-        output = pipe(
-            image=image,
-            prompt="my cat",
-            num_frames=num_frames,
-            generator=generator,
-            num_inference_steps=3,
-            output_type="np",
-        )
-        image = output.frames[0]
-        assert image.shape == (num_frames, 704, 1280, 3)
-        image_slice = image[0, -3:, -3:, -1]
-        expected_slice = np.array([0.5482, 0.6244, 0.6274, 0.4584, 0.5935, 0.5937, 0.4579, 0.5767, 0.5892])
-        assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3
--- a/tests/pipelines/musicldm/__init__.py
+++ b/tests/pipelines/musicldm/__init__.py
--- a/tests/pipelines/musicldm/test_musicldm.py
+++ b/tests/pipelines/musicldm/test_musicldm.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import gc
-import unittest
-import numpy as np
-import torch
-from transformers import (
-    ClapAudioConfig,
-    ClapConfig,
-    ClapFeatureExtractor,
-    ClapModel,
-    ClapTextConfig,
-    RobertaTokenizer,
-    SpeechT5HifiGan,
-    SpeechT5HifiGanConfig,
-)
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    LMSDiscreteScheduler,
-    MusicLDMPipeline,
-    PNDMScheduler,
-    UNet2DConditionModel,
-)
-from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    nightly,
-    require_torch_accelerator,
-    torch_device,
-)
-from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = MusicLDMPipeline
-    params = TEXT_TO_AUDIO_PARAMS
-    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "num_waveforms_per_prompt",
-            "generator",
-            "latents",
-            "output_type",
-            "return_dict",
-            "callback",
-            "callback_steps",
-        ]
-    )
-    supports_dduf = False
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=(32, 64),
-            class_embed_type="simple_projection",
-            projection_class_embeddings_input_dim=32,
-            class_embeddings_concat=True,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=1,
-            out_channels=1,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        torch.manual_seed(0)
-        text_branch_config = ClapTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=16,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=2,
-            num_hidden_layers=2,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        audio_branch_config = ClapAudioConfig(
-            spec_size=64,
-            window_size=4,
-            num_mel_bins=64,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            depths=[2, 2],
-            num_attention_heads=[2, 2],
-            num_hidden_layers=2,
-            hidden_size=192,
-            patch_size=2,
-            patch_stride=2,
-            patch_embed_input_channels=4,
-        )
-        text_encoder_config = ClapConfig.from_text_audio_configs(
-            text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=32
-        )
-        text_encoder = ClapModel(text_encoder_config)
-        tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
-        feature_extractor = ClapFeatureExtractor.from_pretrained(
-            "hf-internal-testing/tiny-random-ClapModel", hop_length=7900
-        )
-        torch.manual_seed(0)
-        vocoder_config = SpeechT5HifiGanConfig(
-            model_in_dim=8,
-            sampling_rate=16000,
-            upsample_initial_channel=16,
-            upsample_rates=[2, 2],
-            upsample_kernel_sizes=[4, 4],
-            resblock_kernel_sizes=[3, 7],
-            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
-            normalize_before=False,
-        )
-        vocoder = SpeechT5HifiGan(vocoder_config)
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "feature_extractor": feature_extractor,
-            "vocoder": vocoder,
-        }
-        return components
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A hammer hitting a wooden surface",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-        }
-        return inputs
-    def test_musicldm_ddim(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        musicldm_pipe = MusicLDMPipeline(**components)
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(device)
-        output = musicldm_pipe(**inputs)
-        audio = output.audios[0]
-        assert audio.ndim == 1
-        assert len(audio) == 256
-        audio_slice = audio[:10]
-        expected_slice = np.array(
-            [-0.0027, -0.0036, -0.0037, -0.0020, -0.0035, -0.0019, -0.0037, -0.0020, -0.0038, -0.0019]
-        )
-        assert np.abs(audio_slice - expected_slice).max() < 1e-4
-    def test_musicldm_prompt_embeds(self):
-        components = self.get_dummy_components()
-        musicldm_pipe = MusicLDMPipeline(**components)
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["prompt"] = 3 * [inputs["prompt"]]
-        # forward
-        output = musicldm_pipe(**inputs)
-        audio_1 = output.audios[0]
-        inputs = self.get_dummy_inputs(torch_device)
-        prompt = 3 * [inputs.pop("prompt")]
-        text_inputs = musicldm_pipe.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=musicldm_pipe.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_inputs = text_inputs["input_ids"].to(torch_device)
-        prompt_embeds = musicldm_pipe.text_encoder.get_text_features(text_inputs)
-        inputs["prompt_embeds"] = prompt_embeds
-        # forward
-        output = musicldm_pipe(**inputs)
-        audio_2 = output.audios[0]
-        assert np.abs(audio_1 - audio_2).max() < 1e-2
-    def test_musicldm_negative_prompt_embeds(self):
-        components = self.get_dummy_components()
-        musicldm_pipe = MusicLDMPipeline(**components)
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        negative_prompt = 3 * ["this is a negative prompt"]
-        inputs["negative_prompt"] = negative_prompt
-        inputs["prompt"] = 3 * [inputs["prompt"]]
-        # forward
-        output = musicldm_pipe(**inputs)
-        audio_1 = output.audios[0]
-        inputs = self.get_dummy_inputs(torch_device)
-        prompt = 3 * [inputs.pop("prompt")]
-        embeds = []
-        for p in [prompt, negative_prompt]:
-            text_inputs = musicldm_pipe.tokenizer(
-                p,
-                padding="max_length",
-                max_length=musicldm_pipe.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_inputs = text_inputs["input_ids"].to(torch_device)
-            text_embeds = musicldm_pipe.text_encoder.get_text_features(
-                text_inputs,
-            )
-            embeds.append(text_embeds)
-        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
-        # forward
-        output = musicldm_pipe(**inputs)
-        audio_2 = output.audios[0]
-        assert np.abs(audio_1 - audio_2).max() < 1e-2
-    def test_musicldm_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
-        musicldm_pipe = MusicLDMPipeline(**components)
-        musicldm_pipe = musicldm_pipe.to(device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(device)
-        negative_prompt = "egg cracking"
-        output = musicldm_pipe(**inputs, negative_prompt=negative_prompt)
-        audio = output.audios[0]
-        assert audio.ndim == 1
-        assert len(audio) == 256
-        audio_slice = audio[:10]
-        expected_slice = np.array(
-            [-0.0027, -0.0036, -0.0037, -0.0019, -0.0035, -0.0018, -0.0037, -0.0021, -0.0038, -0.0018]
-        )
-        assert np.abs(audio_slice - expected_slice).max() < 1e-4
-    def test_musicldm_num_waveforms_per_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
-        musicldm_pipe = MusicLDMPipeline(**components)
-        musicldm_pipe = musicldm_pipe.to(device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        prompt = "A hammer hitting a wooden surface"
-        # test num_waveforms_per_prompt=1 (default)
-        audios = musicldm_pipe(prompt, num_inference_steps=2).audios
-        assert audios.shape == (1, 256)
-        # test num_waveforms_per_prompt=1 (default) for batch of prompts
-        batch_size = 2
-        audios = musicldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
-        assert audios.shape == (batch_size, 256)
-        # test num_waveforms_per_prompt for single prompt
-        num_waveforms_per_prompt = 2
-        audios = musicldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios
-        assert audios.shape == (num_waveforms_per_prompt, 256)
-        # test num_waveforms_per_prompt for batch of prompts
-        batch_size = 2
-        audios = musicldm_pipe(
-            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
-        ).audios
-        assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
-    def test_musicldm_audio_length_in_s(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        musicldm_pipe = MusicLDMPipeline(**components)
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        vocoder_sampling_rate = musicldm_pipe.vocoder.config.sampling_rate
-        inputs = self.get_dummy_inputs(device)
-        output = musicldm_pipe(audio_length_in_s=0.016, **inputs)
-        audio = output.audios[0]
-        assert audio.ndim == 1
-        assert len(audio) / vocoder_sampling_rate == 0.016
-        output = musicldm_pipe(audio_length_in_s=0.032, **inputs)
-        audio = output.audios[0]
-        assert audio.ndim == 1
-        assert len(audio) / vocoder_sampling_rate == 0.032
-    def test_musicldm_vocoder_model_in_dim(self):
-        components = self.get_dummy_components()
-        musicldm_pipe = MusicLDMPipeline(**components)
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        prompt = ["hey"]
-        output = musicldm_pipe(prompt, num_inference_steps=1)
-        audio_shape = output.audios.shape
-        assert audio_shape == (1, 256)
-        config = musicldm_pipe.vocoder.config
-        config.model_in_dim *= 2
-        musicldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
-        output = musicldm_pipe(prompt, num_inference_steps=1)
-        audio_shape = output.audios.shape
-        # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
-        assert audio_shape == (1, 256)
-    def test_attention_slicing_forward_pass(self):
-        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical()
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
-    def test_to_dtype(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        # The method component.dtype returns the dtype of the first parameter registered in the model, not the
-        # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
-        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
-        # Without the logit scale parameters, everything is float32
-        model_dtypes.pop("text_encoder")
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
-        # the CLAP sub-models are float32
-        model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
-        # Once we send to fp16, all params are in half-precision, including the logit scale
-        pipe.to(dtype=torch.float16)
-        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
-        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
-@nightly
-@require_torch_accelerator
-class MusicLDMPipelineNightlyTests(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
-        generator = torch.Generator(device=generator_device).manual_seed(seed)
-        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
-        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
-        inputs = {
-            "prompt": "A hammer hitting a wooden surface",
-            "latents": latents,
-            "generator": generator,
-            "num_inference_steps": 3,
-            "guidance_scale": 2.5,
-        }
-        return inputs
-    def test_musicldm(self):
-        musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm")
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 25
-        audio = musicldm_pipe(**inputs).audios[0]
-        assert audio.ndim == 1
-        assert len(audio) == 81952
-        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
-        audio_slice = audio[8680:8690]
-        expected_slice = np.array(
-            [-0.1042, -0.1068, -0.1235, -0.1387, -0.1428, -0.136, -0.1213, -0.1097, -0.0967, -0.0945]
-        )
-        max_diff = np.abs(expected_slice - audio_slice).max()
-        assert max_diff < 1e-3
-    def test_musicldm_lms(self):
-        musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm")
-        musicldm_pipe.scheduler = LMSDiscreteScheduler.from_config(musicldm_pipe.scheduler.config)
-        musicldm_pipe = musicldm_pipe.to(torch_device)
-        musicldm_pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_inputs(torch_device)
-        audio = musicldm_pipe(**inputs).audios[0]
-        assert audio.ndim == 1
-        assert len(audio) == 81952
-        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
-        audio_slice = audio[58020:58030]
-        expected_slice = np.array([0.3592, 0.3477, 0.4084, 0.4665, 0.5048, 0.5891, 0.6461, 0.5579, 0.4595, 0.4403])
-        max_diff = np.abs(expected_slice - audio_slice).max()
-        assert max_diff < 1e-3
--- a/tests/pipelines/paint_by_example/__init__.py
+++ b/tests/pipelines/paint_by_example/__init__.py
--- a/tests/pipelines/paint_by_example/test_paint_by_example.py
+++ b/tests/pipelines/paint_by_example/test_paint_by_example.py
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import gc
-import random
-import unittest
-import numpy as np
-import torch
-from PIL import Image
-from transformers import CLIPImageProcessor, CLIPVisionConfig
-from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
-from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    floats_tensor,
-    load_image,
-    nightly,
-    require_torch_accelerator,
-    torch_device,
-)
-from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-enable_full_determinism()
-class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = PaintByExamplePipeline
-    params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
-    batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
-    image_params = frozenset([])  # TO_DO: update the image_prams once refactored VaeImageProcessor.preprocess
-    supports_dduf = False
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=9,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        torch.manual_seed(0)
-        config = CLIPVisionConfig(
-            hidden_size=32,
-            projection_dim=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            image_size=32,
-            patch_size=4,
-        )
-        image_encoder = PaintByExampleImageEncoder(config, proj_size=32)
-        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "image_encoder": image_encoder,
-            "safety_checker": None,
-            "feature_extractor": feature_extractor,
-        }
-        return components
-    def convert_to_pt(self, image):
-        image = np.array(image.convert("RGB"))
-        image = image[None].transpose(0, 3, 1, 2)
-        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-        return image
-    def get_dummy_inputs(self, device="cpu", seed=0):
-        # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
-        image = image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
-        example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "example_image": example_image,
-            "image": init_image,
-            "mask_image": mask_image,
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "np",
-        }
-        return inputs
-    def test_paint_by_example_inpaint(self):
-        components = self.get_dummy_components()
-        # make sure here that pndm scheduler skips prk
-        pipe = PaintByExamplePipeline(**components)
-        pipe = pipe.to("cpu")
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs()
-        output = pipe(**inputs)
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.4686, 0.5687, 0.4007, 0.5218, 0.5741, 0.4482, 0.4940, 0.4629, 0.4503])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-    def test_paint_by_example_image_tensor(self):
-        device = "cpu"
-        inputs = self.get_dummy_inputs()
-        inputs.pop("mask_image")
-        image = self.convert_to_pt(inputs.pop("image"))
-        mask_image = image.clamp(0, 1) / 2
-        # make sure here that pndm scheduler skips prk
-        pipe = PaintByExamplePipeline(**self.get_dummy_components())
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-        output = pipe(image=image, mask_image=mask_image[:, 0], **inputs)
-        out_1 = output.images
-        image = image.cpu().permute(0, 2, 3, 1)[0]
-        mask_image = mask_image.cpu().permute(0, 2, 3, 1)[0]
-        image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB")
-        output = pipe(**self.get_dummy_inputs())
-        out_2 = output.images
-        assert out_1.shape == (1, 64, 64, 3)
-        assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2
-    def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
-@nightly
-@require_torch_accelerator
-class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
-    def setUp(self):
-        # clean up the VRAM before each test
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-    def test_paint_by_example(self):
-        # make sure here that pndm scheduler skips prk
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/paint_by_example/dog_in_bucket.png"
-        )
-        mask_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/paint_by_example/mask.png"
-        )
-        example_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/paint_by_example/panda.jpg"
-        )
-        pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        generator = torch.manual_seed(321)
-        output = pipe(
-            image=init_image,
-            mask_image=mask_image,
-            example_image=example_image,
-            generator=generator,
-            guidance_scale=5.0,
-            num_inference_steps=50,
-            output_type="np",
-        )
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.5290, 0.5374])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2