"tests/python/pytorch/sparse/utils.py" did not exist on "0698e91a0e4b40bd4a5a4e59205d098e1bb3d3c9"
Unverified commit 049082e0 authored by Yao Matrix, committed by GitHub

enable pipeline test cases on xpu (#11527)



* enable several pipeline integration tests on xpu
Signed-off-by: Yao Matrix <matrix.yao@intel.com>

* fix style
Signed-off-by: Yao Matrix <matrix.yao@intel.com>

* update per comments
Signed-off-by: Matrix Yao <matrix.yao@intel.com>

---------
Signed-off-by: Yao Matrix <matrix.yao@intel.com>
Signed-off-by: Matrix Yao <matrix.yao@intel.com>
parent f161e277
@@ -635,10 +635,10 @@ def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -
     return arry


-def load_pt(url: str, map_location: str):
+def load_pt(url: str, map_location: Optional[str] = None, weights_only: Optional[bool] = True):
     response = requests.get(url, timeout=DIFFUSERS_REQUEST_TIMEOUT)
     response.raise_for_status()
-    arry = torch.load(BytesIO(response.content), map_location=map_location)
+    arry = torch.load(BytesIO(response.content), map_location=map_location, weights_only=weights_only)
     return arry
...
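Context for the signature change (not part of the diff): newer PyTorch releases default `torch.load` to `weights_only=True`, which refuses to unpickle arbitrary Python objects. The helper keeps that safe default while letting individual tests opt out, and `map_location` can now be omitted. A usage sketch with hypothetical URLs:

```python
from diffusers.utils.testing_utils import load_pt, torch_device

# Plain tensor files load under the safe default (weights_only=True) and can
# be mapped straight onto the active accelerator (cuda, xpu, ...).
embedding = load_pt("https://example.com/embedding.pt", map_location=torch_device)

# Files that pickle richer Python objects must opt out of the safe default.
legacy = load_pt("https://example.com/legacy_result.pt", weights_only=False)
```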
@@ -304,7 +304,8 @@ class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase):
         generator = torch.Generator(device="cpu").manual_seed(0)
         image_embedding = load_pt(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/image_embedding.pt"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/image_embedding.pt",
+            map_location=torch_device,
         )

         image = pipe(
@@ -320,4 +321,4 @@ class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/stable_cascade_decoder_image.npy"
         )
         max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
-        assert max_diff < 1e-4
+        assert max_diff < 2e-4
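The threshold relaxes from 1e-4 to 2e-4 so the same check passes on both CUDA and XPU, whose kernels accumulate slightly different floating-point error. `numpy_cosine_similarity_distance` compares the direction of the flattened outputs; a sketch of the metric, not necessarily the library's exact code:

```python
import numpy as np

def cosine_similarity_distance(a: np.ndarray, b: np.ndarray) -> float:
    # 0.0 when the flattened outputs point the same way; grows as they diverge.
    similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return float(1.0 - similarity)
```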
@@ -20,26 +20,32 @@ import numpy as np
 import torch

 from diffusers import StableDiffusionKDiffusionPipeline
-from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    nightly,
+    require_torch_accelerator,
+    torch_device,
+)


 enable_full_determinism()


 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_stable_diffusion_1(self):
         sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
...
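`backend_empty_cache` is the device-agnostic stand-in for the hard-coded `torch.cuda.empty_cache()`: it routes the cache flush to whichever backend `torch_device` names. A minimal sketch of the dispatch pattern; the real helper in `diffusers.utils.testing_utils` covers more backends and edge cases:

```python
import torch

def empty_cache(device: str) -> None:
    # Release cached allocator blocks on the active backend; a no-op on CPU.
    if device.startswith("cuda"):
        torch.cuda.empty_cache()
    elif device.startswith("xpu"):
        torch.xpu.empty_cache()
    elif device.startswith("mps"):
        torch.mps.empty_cache()
```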
@@ -28,7 +28,13 @@ from diffusers import (
     StableDiffusionLDM3DPipeline,
     UNet2DConditionModel,
 )
-from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    nightly,
+    require_torch_accelerator,
+    torch_device,
+)

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -205,17 +211,17 @@ class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase):

 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -256,17 +262,17 @@ class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase):

 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
...
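`require_torch_accelerator` generalizes `require_torch_gpu`: rather than skipping on everything but CUDA, it only asks for some non-CPU torch device. A sketch of the idea, assuming `torch_device` already resolves to the best available backend:

```python
import unittest

from diffusers.utils.testing_utils import torch_device

def require_accelerator(test_case):
    # Skip unless some hardware accelerator (cuda, xpu, mps, ...) is available,
    # instead of hard-coding CUDA the way a GPU-only decorator does.
    return unittest.skipUnless(torch_device != "cpu", "test requires a hardware accelerator")(test_case)
```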
@@ -29,7 +29,13 @@ from diffusers import (
     StableDiffusionSAGPipeline,
     UNet2DConditionModel,
 )
-from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    nightly,
+    require_torch_accelerator,
+    torch_device,
+)

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import (
@@ -162,19 +168,19 @@ class StableDiffusionSAGPipelineFastTests(

 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_stable_diffusion_1(self):
         sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
...
@@ -13,7 +13,17 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
-from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
+    enable_full_determinism,
+    load_numpy,
+    nightly,
+    require_torch_accelerator,
+    torch_device,
+)

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import (
@@ -190,19 +200,19 @@ class StableUnCLIPPipelineFastTests(

 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableUnCLIPPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_stable_unclip(self):
         expected_image = load_numpy(
@@ -226,9 +236,9 @@ class StableUnCLIPPipelineIntegrationTests(unittest.TestCase):
         assert_mean_pixel_difference(image, expected_image)

     def test_stable_unclip_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16)
         pipe.set_progress_bar_config(disable=None)
@@ -242,6 +252,6 @@ class StableUnCLIPPipelineIntegrationTests(unittest.TestCase):
             output_type="np",
         )

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 7 GB is allocated
         assert mem_bytes < 7 * 10**9
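The `backend_*_memory_*` helpers keep the 7 GB offloading assertion backend-neutral, assuming the XPU build mirrors CUDA's memory-stats API (recent PyTorch releases expose it under `torch.xpu`). A minimal sketch of the dispatch, not the library's exact implementation:

```python
import torch

def max_memory_allocated(device: str) -> int:
    # Peak bytes allocated on the active backend since the last stats reset.
    if device.startswith("cuda"):
        return torch.cuda.max_memory_allocated()
    if device.startswith("xpu"):
        return torch.xpu.max_memory_allocated()  # assumes a recent PyTorch XPU build
    return 0  # CPU and other backends: nothing to measure
```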
@@ -18,12 +18,16 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     nightly,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     torch_device,
 )
@@ -213,19 +217,19 @@ class StableUnCLIPImg2ImgPipelineFastTests(

 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_stable_unclip_l_img2img(self):
         input_image = load_image(
@@ -286,9 +290,9 @@ class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png"
         )

-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
             "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16
@@ -304,6 +308,6 @@ class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase):
             output_type="np",
         )

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 7 GB is allocated
         assert mem_bytes < 7 * 10**9
@@ -19,37 +19,44 @@ import unittest
 import torch

 from diffusers import DDIMScheduler, TextToVideoZeroPipeline
-from diffusers.utils.testing_utils import load_pt, nightly, require_torch_gpu
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    load_pt,
+    nightly,
+    require_torch_accelerator,
+    torch_device,
+)

 from ..test_pipelines_common import assert_mean_pixel_difference


 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class TextToVideoZeroPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_full_model(self):
         model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-        pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+        pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        generator = torch.Generator(device="cuda").manual_seed(0)
+        generator = torch.Generator(device="cpu").manual_seed(0)

         prompt = "A bear is playing a guitar on Times Square"
         result = pipe(prompt=prompt, generator=generator).images
         expected_result = load_pt(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt",
+            weights_only=False,
         )
         assert_mean_pixel_difference(result, expected_result)
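Two portability details in this hunk: the expected-result file opts out with `weights_only=False`, presumably because it pickles more than plain tensors, and the generator moves from `device="cuda"` to CPU so that the seed produces bit-identical initial noise on every backend. A small illustration of the second point (shapes are arbitrary):

```python
import torch

# A CPU generator yields the same pseudo-random stream no matter which
# accelerator later runs the model, so seeded tests stay comparable
# across cuda and xpu machines.
generator = torch.Generator(device="cpu").manual_seed(0)
noise = torch.randn(1, 4, 64, 64, generator=generator)  # identical bytes on any host
```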
@@ -24,11 +24,11 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProject

 from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoZeroSDXLPipeline, UNet2DConditionModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     nightly,
     require_accelerate_version_greater,
-    require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
@@ -220,7 +220,7 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
         self.assertLess(max_diff, expected_max_difference)

     @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
-    @require_accelerator
+    @require_torch_accelerator
     def test_float16_inference(self, expected_max_diff=5e-2):
         components = self.get_dummy_components()
         for name, module in components.items():
@@ -262,7 +262,7 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
     def test_inference_batch_single_identical(self):
         pass

-    @require_accelerator
+    @require_torch_accelerator
     @require_accelerate_version_greater("0.17.0")
     def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
         components = self.get_dummy_components()
@@ -285,7 +285,7 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
         pass

     @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
-    @require_accelerator
+    @require_torch_accelerator
     def test_save_load_float16(self, expected_max_diff=1e-2):
         components = self.get_dummy_components()
         for name, module in components.items():
@@ -337,7 +337,7 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
     def test_sequential_cpu_offload_forward_pass(self):
         pass

-    @require_accelerator
+    @require_torch_accelerator
     def test_to_device(self):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
@@ -365,19 +365,19 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe

 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class TextToVideoZeroSDXLPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_full_model(self):
         model_id = "stabilityai/stable-diffusion-xl-base-1.0"
...
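Note that the float16 tests keep an explicit device allowlist on top of the decorator: an accelerator being present does not by itself guarantee well-supported half-precision kernels. The gating pattern used above, in isolation:

```python
import unittest

from diffusers.utils.testing_utils import torch_device

class Fp16GatingExample(unittest.TestCase):  # hypothetical test class for illustration
    # Gate fp16 paths to backends with well-supported half-precision kernels.
    @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
    def test_float16_inference(self):
        ...  # fp16 pipeline checks would go here
```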
@@ -23,10 +23,14 @@ from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokeni

 from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
 from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     load_numpy,
     nightly,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     torch_device,
 )
@@ -426,13 +430,13 @@ class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_unclip_karlo_cpu_fp32(self):
         expected_image = load_numpy(
@@ -458,19 +462,19 @@ class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):

 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class UnCLIPPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_unclip_karlo(self):
         expected_image = load_numpy(
@@ -496,9 +500,9 @@ class UnCLIPPipelineIntegrationTests(unittest.TestCase):
         assert_mean_pixel_difference(image, expected_image)

     def test_unclip_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
         pipe.set_progress_bar_config(disable=None)
@@ -514,6 +518,6 @@ class UnCLIPPipelineIntegrationTests(unittest.TestCase):
             output_type="np",
         )

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 7 GB is allocated
         assert mem_bytes < 7 * 10**9
@@ -37,12 +37,13 @@ from diffusers import (
 )
 from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     nightly,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     torch_device,
 )
@@ -496,19 +497,19 @@ class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCa

 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_unclip_image_variation_karlo(self):
         input_image = load_image(
...