Unverified Commit 2d380895 authored by Yao Matrix, committed by GitHub

enable 7 cases on XPU (#11503)

* enable 7 cases on XPU

Signed-off-by: Yao Matrix <matrix.yao@intel.com>

* calibrate A100 expectations

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

---------

Signed-off-by: Yao Matrix <matrix.yao@intel.com>
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
parent 0c47c954
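The change applied across all seven test files below swaps the CUDA-only `torch.cuda.empty_cache()` for `backend_empty_cache(torch_device)` from `diffusers.utils.testing_utils`, and `require_torch_gpu` for `require_torch_accelerator`, so the same integration tests can run on XPU. A minimal sketch of the dispatch pattern a helper like `backend_empty_cache` follows, assuming a plain string match on the device name (the actual implementation in diffusers may differ):

import torch

def backend_empty_cache_sketch(device: str) -> None:
    # Release cached allocator memory on whichever backend the tests run on.
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        torch.xpu.empty_cache()  # Intel GPU backend
    elif device == "mps":
        torch.mps.empty_cache()  # Apple Silicon backend
    # On CPU there is no device-side cache to clear, so this is a no-op.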
@@ -24,9 +24,10 @@ from transformers import AutoTokenizer, T5EncoderModel
 from diffusers import AutoencoderKLCogVideoX, ConsisIDPipeline, ConsisIDTransformer3DModel, DDIMScheduler
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -316,19 +317,19 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class ConsisIDPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_consisid(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -338,8 +339,8 @@ class ConsisIDPipelineIntegrationTests(unittest.TestCase):
         prompt = self.prompt
         image = load_image("https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true")

-        id_vit_hidden = [torch.ones([1, 2, 2])] * 1
-        id_cond = torch.ones(1, 2)
+        id_vit_hidden = [torch.ones([1, 577, 1024])] * 5
+        id_cond = torch.ones(1, 1280)

         videos = pipe(
             image=image,
@@ -357,5 +358,5 @@ class ConsisIDPipelineIntegrationTests(unittest.TestCase):
         video = videos[0]
         expected_video = torch.randn(1, 16, 480, 720, 3).numpy()

-        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
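The new `.cpu()` call matters because a tensor living on an XPU or CUDA device cannot be handed to NumPy directly; it has to be copied to host memory first. For reference, `numpy_cosine_similarity_distance` computes one minus the cosine similarity of its two inputs; a minimal sketch, assuming flattened float arrays:

import numpy as np

def numpy_cosine_similarity_distance_sketch(a, b) -> float:
    # 0.0 means the two arrays point in exactly the same direction.
    a, b = np.asarray(a).ravel(), np.asarray(b).ravel()
    similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return float(1.0 - similarity)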
@@ -27,9 +27,10 @@ from diffusers import (
     FlowMatchEulerDiscreteScheduler,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -256,19 +257,19 @@ class EasyAnimatePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class EasyAnimatePipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_EasyAnimate(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -27,8 +27,8 @@ from diffusers.utils.testing_utils import (
     enable_full_determinism,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
-    require_torch_gpu,
+    require_big_accelerator,
+    require_torch_accelerator,
     torch_device,
 )
@@ -266,9 +266,9 @@ class MochiPipelineFastTests(PipelineTesterMixin, FasterCacheTesterMixin, unittest.TestCase):
 @nightly
-@require_torch_gpu
-@require_big_gpu_with_torch_cuda
-@pytest.mark.big_gpu_with_torch_cuda
+@require_torch_accelerator
+@require_big_accelerator
+@pytest.mark.big_accelerator
 class MochiPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."
@@ -302,5 +302,5 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
         video = videos[0]
         expected_video = torch.randn(1, 19, 480, 848, 3).numpy()

-        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
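Renaming the pytest marker from `big_gpu_with_torch_cuda` to `big_accelerator` also assumes the new marker is registered, so runs with `--strict-markers` keep passing. A sketch of that registration via a conftest.py hook; where the repository actually declares its markers may differ:

# conftest.py (sketch): register the custom marker so pytest does not warn
# about, or reject, unknown markers.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "big_accelerator: test needs a large-memory accelerator"
    )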
@@ -7,8 +7,10 @@ from transformers import AutoTokenizer
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, OmniGenPipeline, OmniGenTransformer2DModel
 from diffusers.utils.testing_utils import (
+    Expectations,
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -87,7 +89,7 @@ class OmniGenPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class OmniGenPipelineSlowTests(unittest.TestCase):
     pipeline_class = OmniGenPipeline
     repo_id = "shitao/OmniGen-v1-diffusers"
@@ -95,12 +97,12 @@ class OmniGenPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
@@ -125,21 +127,56 @@ class OmniGenPipelineSlowTests(unittest.TestCase):
         image = pipe(**inputs).images[0]

         image_slice = image[0, :10, :10]

-        expected_slice = np.array(
-            [
-                [0.1783447, 0.16772744, 0.14339337],
-                [0.17066911, 0.15521264, 0.13757327],
-                [0.17072496, 0.15531206, 0.13524258],
-                [0.16746324, 0.1564025, 0.13794944],
-                [0.16490817, 0.15258026, 0.13697758],
-                [0.16971767, 0.15826806, 0.13928896],
-                [0.16782972, 0.15547255, 0.13783783],
-                [0.16464645, 0.15281534, 0.13522372],
-                [0.16535294, 0.15301755, 0.13526791],
-                [0.16365296, 0.15092957, 0.13443318],
-            ],
-            dtype=np.float32,
-        )
+        expected_slices = Expectations(
+            {
+                ("xpu", 3): np.array(
+                    [
+                        [0.05859375, 0.05859375, 0.04492188],
+                        [0.04882812, 0.04101562, 0.03320312],
+                        [0.04882812, 0.04296875, 0.03125],
+                        [0.04296875, 0.0390625, 0.03320312],
+                        [0.04296875, 0.03710938, 0.03125],
+                        [0.04492188, 0.0390625, 0.03320312],
+                        [0.04296875, 0.03710938, 0.03125],
+                        [0.04101562, 0.03710938, 0.02734375],
+                        [0.04101562, 0.03515625, 0.02734375],
+                        [0.04101562, 0.03515625, 0.02929688],
+                    ],
+                    dtype=np.float32,
+                ),
+                ("cuda", 7): np.array(
+                    [
+                        [0.1783447, 0.16772744, 0.14339337],
+                        [0.17066911, 0.15521264, 0.13757327],
+                        [0.17072496, 0.15531206, 0.13524258],
+                        [0.16746324, 0.1564025, 0.13794944],
+                        [0.16490817, 0.15258026, 0.13697758],
+                        [0.16971767, 0.15826806, 0.13928896],
+                        [0.16782972, 0.15547255, 0.13783783],
+                        [0.16464645, 0.15281534, 0.13522372],
+                        [0.16535294, 0.15301755, 0.13526791],
+                        [0.16365296, 0.15092957, 0.13443318],
+                    ],
+                    dtype=np.float32,
+                ),
+                ("cuda", 8): np.array(
+                    [
+                        [0.0546875, 0.05664062, 0.04296875],
+                        [0.046875, 0.04101562, 0.03320312],
+                        [0.05078125, 0.04296875, 0.03125],
+                        [0.04296875, 0.04101562, 0.03320312],
+                        [0.0390625, 0.03710938, 0.02929688],
+                        [0.04296875, 0.03710938, 0.03125],
+                        [0.0390625, 0.03710938, 0.02929688],
+                        [0.0390625, 0.03710938, 0.02734375],
+                        [0.0390625, 0.03320312, 0.02734375],
+                        [0.0390625, 0.03320312, 0.02734375],
+                    ],
+                    dtype=np.float32,
+                ),
+            }
+        )
+        expected_slice = expected_slices.get_expectation()

         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
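`Expectations` maps `(device_type, major_version)` keys, e.g. `("cuda", 8)`, to per-backend golden outputs, and `get_expectation()` returns the entry matching the machine the test runs on. A hypothetical stand-in showing the lookup idea, assuming exact-match-then-same-device fallback (the real class is more involved and detects the running device itself):

import numpy as np

class ExpectationsSketch:
    """Illustrative stand-in for Expectations; not the real API."""

    def __init__(self, expectations: dict):
        self.expectations = expectations

    def get_expectation(self, device_type: str, major_version: int):
        # Prefer an exact (device, version) match, then any entry for the device.
        key = (device_type, major_version)
        if key in self.expectations:
            return self.expectations[key]
        for (dev, _), value in self.expectations.items():
            if dev == device_type:
                return value
        raise KeyError(f"no expectation recorded for {key}")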
@@ -25,11 +25,12 @@ from transformers import CLIPImageProcessor, CLIPVisionConfig
 from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
 from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     nightly,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
@@ -174,19 +175,19 @@ class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_paint_by_example(self):
         # make sure here that pndm scheduler skips prk
@@ -32,7 +32,14 @@ from diffusers import (
     StableAudioProjectionModel,
 )
 from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import (
+    Expectations,
+    backend_empty_cache,
+    enable_full_determinism,
+    nightly,
+    require_torch_accelerator,
+    torch_device,
+)

 from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin
@@ -419,17 +426,17 @@ class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableAudioPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -459,9 +466,15 @@ class StableAudioPipelineIntegrationTests(unittest.TestCase):
         # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
         audio_slice = audio[0, 447590:447600]
         # fmt: off
-        expected_slice = np.array(
-            [-0.0278, 0.1096, 0.1877, 0.3178, 0.5329, 0.6990, 0.6972, 0.6186, 0.5608, 0.5060]
-        )
+        expected_slices = Expectations(
+            {
+                ("xpu", 3): np.array([-0.0285, 0.1083, 0.1863, 0.3165, 0.5312, 0.6971, 0.6958, 0.6177, 0.5598, 0.5048]),
+                ("cuda", 7): np.array([-0.0278, 0.1096, 0.1877, 0.3178, 0.5329, 0.6990, 0.6972, 0.6186, 0.5608, 0.5060]),
+                ("cuda", 8): np.array([-0.0285, 0.1082, 0.1862, 0.3163, 0.5306, 0.6964, 0.6953, 0.6172, 0.5593, 0.5044]),
+            }
+        )
         # fmt: on
+        expected_slice = expected_slices.get_expectation()

         max_diff = np.abs(expected_slice - audio_slice.detach().cpu().numpy()).max()
         assert max_diff < 1.5e-3
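The same keying handles these 1-D audio slices: floating-point kernels differ slightly across backends and across CUDA compute-capability majors (7 versus 8 above), so each combination gets its own golden values. A usage sketch against the illustrative ExpectationsSketch class above, reusing two of the StableAudio values:

import numpy as np

# Hypothetical usage of the sketch class; not the real Expectations call signature.
slices = ExpectationsSketch(
    {
        ("xpu", 3): np.array([-0.0285, 0.1083]),
        ("cuda", 8): np.array([-0.0285, 0.1082]),
    }
)
print(slices.get_expectation("cuda", 8))  # -> [-0.0285  0.1082]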
@@ -389,7 +389,7 @@ class BnB4BitBasicTests(Base4bitTests):
 class BnB4BitTrainingTests(Base4bitTests):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         nf4_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -657,7 +657,7 @@ class SlowBnb4BitTests(Base4bitTests):
 class SlowBnb4BitFluxTests(Base4bitTests):
     def setUp(self) -> None:
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         model_id = "hf-internal-testing/flux.1-dev-nf4-pkg"
         t5_4bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
@@ -674,7 +674,7 @@ class SlowBnb4BitFluxTests(Base4bitTests):
         del self.pipeline_4bit

         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_quality(self):
         # keep the resolution and max tokens to a lower number for faster execution.
@@ -722,7 +722,7 @@ class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
 class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
     def setUp(self) -> None:
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         self.pipeline_4bit = FluxControlPipeline.from_pretrained("eramth/flux-4bit", torch_dtype=torch.float16)
         self.pipeline_4bit.enable_model_cpu_offload()
@@ -731,7 +731,7 @@ class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
         del self.pipeline_4bit

         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_lora_loading(self):
         self.pipeline_4bit.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")