Unverified Commit 33e636ce authored by Yao Matrix, committed by GitHub

enable torchao test cases on XPU and switch to device-agnostic APIs for test cases (#11654)



* enable torchao cases on XPU
Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* device agnostic APIs
Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* more
Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style
Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* enable test_torch_compile_recompilation_and_graph_break on XPU
Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* resolve comments
Signed-off-by: YAO Matrix <matrix.yao@intel.com>

---------
Signed-off-by: Matrix YAO <matrix.yao@intel.com>
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
parent e27142ac
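The common pattern across the hunks below is swapping CUDA-only calls (torch.cuda.empty_cache(), torch.cuda.synchronize(), hard-coded "cuda" devices) for the device-agnostic helpers backend_empty_cache, backend_synchronize, and torch_device from diffusers.utils.testing_utils, so the same tests run on CUDA and XPU runners. A minimal sketch of the resulting setUp/tearDown pattern (the test class here is illustrative; the helpers are the ones imported in the diff):

import gc
import unittest

from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_synchronize,
    torch_device,
)


class ExampleDeviceAgnosticTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        # Clears the cache of whichever accelerator torch_device resolves to
        # (CUDA, XPU, ...) instead of the CUDA-only torch.cuda.empty_cache().
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)
        # Device-agnostic stand-in for torch.cuda.synchronize().
        backend_synchronize(torch_device)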
@@ -233,7 +233,7 @@ class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin):
@slow
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
@pytest.mark.big_accelerator
class StableDiffusion3PipelineSlowTests(unittest.TestCase):
pipeline_class = StableDiffusion3Pipeline
repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
......
@@ -168,7 +168,7 @@ class StableDiffusion3Img2ImgPipelineFastTests(PipelineLatentTesterMixin, unitte
@slow
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
@pytest.mark.big_accelerator
class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
pipeline_class = StableDiffusion3Img2ImgPipeline
repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
......
@@ -35,6 +35,7 @@ from diffusers import (
UniPCMultistepScheduler,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_image,
numpy_cosine_similarity_distance,
@@ -940,12 +941,12 @@ class StableDiffusionXLPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_stable_diffusion_lcm(self):
torch.manual_seed(0)
......
@@ -39,6 +39,7 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
@@ -670,12 +671,12 @@ class StableDiffusionXLImg2ImgPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_stable_diffusion_xl_img2img_playground(self):
torch.manual_seed(0)
......
@@ -1218,13 +1218,13 @@ class PipelineFastTests(unittest.TestCase):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def dummy_image(self):
batch_size = 1
......
@@ -21,9 +21,11 @@ from transformers import AutoTokenizer, T5EncoderModel
from diffusers import AutoencoderKLWan, FlowMatchEulerDiscreteScheduler, WanPipeline, WanTransformer3DModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
require_torch_accelerator,
slow,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -144,12 +146,12 @@ class WanPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
@unittest.skip("TODO: test needs to be implemented")
def test_Wanx(self):
......
@@ -30,13 +30,15 @@ from diffusers import (
)
from diffusers.models.attention_processor import Attention
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_synchronize,
enable_full_determinism,
is_torch_available,
is_torchao_available,
nightly,
numpy_cosine_similarity_distance,
require_torch,
require_torch_gpu,
require_torch_accelerator,
require_torchao_version_greater_or_equal,
slow,
torch_device,
@@ -61,7 +63,7 @@ if is_torchao_available():
@require_torch
@require_torch_gpu
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.7.0")
class TorchAoConfigTest(unittest.TestCase):
def test_to_dict(self):
@@ -79,7 +81,7 @@ class TorchAoConfigTest(unittest.TestCase):
Test kwargs validations in TorchAoConfig
"""
_ = TorchAoConfig("int4_weight_only")
with self.assertRaisesRegex(ValueError, "is not supported yet"):
with self.assertRaisesRegex(ValueError, "is not supported"):
_ = TorchAoConfig("uint8")
with self.assertRaisesRegex(ValueError, "does not support the following keyword arguments"):
@@ -119,12 +121,12 @@ class TorchAoConfigTest(unittest.TestCase):
# Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
@require_torch
@require_torch_gpu
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.7.0")
class TorchAoTest(unittest.TestCase):
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_dummy_components(
self, quantization_config: TorchAoConfig, model_id: str = "hf-internal-testing/tiny-flux-pipe"
@@ -269,6 +271,7 @@ class TorchAoTest(unittest.TestCase):
subfolder="transformer",
quantization_config=quantization_config,
torch_dtype=torch.bfloat16,
device_map=f"{torch_device}:0",
)
weight = quantized_model.transformer_blocks[0].ff.net[2].weight
@@ -338,7 +341,7 @@ class TorchAoTest(unittest.TestCase):
output = quantized_model(**inputs)[0]
output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
self.assertTrue(numpy_cosine_similarity_distance(output_slice, expected_slice) < 1e-3)
self.assertTrue(numpy_cosine_similarity_distance(output_slice, expected_slice) < 2e-3)
with tempfile.TemporaryDirectory() as offload_folder:
quantization_config = TorchAoConfig("int4_weight_only", group_size=64)
@@ -359,7 +362,7 @@ class TorchAoTest(unittest.TestCase):
output = quantized_model(**inputs)[0]
output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
self.assertTrue(numpy_cosine_similarity_distance(output_slice, expected_slice) < 1e-3)
self.assertTrue(numpy_cosine_similarity_distance(output_slice, expected_slice) < 2e-3)
def test_modules_to_not_convert(self):
quantization_config = TorchAoConfig("int8_weight_only", modules_to_not_convert=["transformer_blocks.0"])
@@ -518,14 +521,14 @@ class TorchAoTest(unittest.TestCase):
# Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
@require_torch
@require_torch_gpu
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.7.0")
class TorchAoSerializationTest(unittest.TestCase):
model_name = "hf-internal-testing/tiny-flux-pipe"
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_dummy_model(self, quant_method, quant_method_kwargs, device=None):
quantization_config = TorchAoConfig(quant_method, **quant_method_kwargs)
@@ -593,17 +596,17 @@ class TorchAoSerializationTest(unittest.TestCase):
)
self.assertTrue(numpy_cosine_similarity_distance(output_slice, expected_slice) < 1e-3)
def test_int_a8w8_cuda(self):
def test_int_a8w8_accelerator(self):
quant_method, quant_method_kwargs = "int8_dynamic_activation_int8_weight", {}
expected_slice = np.array([0.3633, -0.1357, -0.0188, -0.249, -0.4688, 0.5078, -0.1289, -0.6914, 0.4551])
device = "cuda"
device = torch_device
self._test_original_model_expected_slice(quant_method, quant_method_kwargs, expected_slice)
self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device)
def test_int_a16w8_cuda(self):
def test_int_a16w8_accelerator(self):
quant_method, quant_method_kwargs = "int8_weight_only", {}
expected_slice = np.array([0.3613, -0.127, -0.0223, -0.2539, -0.459, 0.4961, -0.1357, -0.6992, 0.4551])
device = "cuda"
device = torch_device
self._test_original_model_expected_slice(quant_method, quant_method_kwargs, expected_slice)
self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device)
@@ -624,14 +627,14 @@ class TorchAoSerializationTest(unittest.TestCase):
# Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
@require_torch
@require_torch_gpu
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.7.0")
@slow
@nightly
class SlowTorchAoTests(unittest.TestCase):
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_dummy_components(self, quantization_config: TorchAoConfig):
# This is just for convenience, so that we can modify it at one place for custom environments and locally testing
@@ -713,8 +716,8 @@ class SlowTorchAoTests(unittest.TestCase):
quantization_config = TorchAoConfig(quant_type=quantization_name, modules_to_not_convert=["x_embedder"])
self._test_quant_type(quantization_config, expected_slice)
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
backend_empty_cache(torch_device)
backend_synchronize(torch_device)
def test_serialization_int8wo(self):
quantization_config = TorchAoConfig("int8wo")
@@ -733,8 +736,8 @@ class SlowTorchAoTests(unittest.TestCase):
pipe.remove_all_hooks()
del pipe.transformer
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
backend_empty_cache(torch_device)
backend_synchronize(torch_device)
transformer = FluxTransformer2DModel.from_pretrained(
tmp_dir, torch_dtype=torch.bfloat16, use_safetensors=False
)
@@ -783,14 +786,14 @@ class SlowTorchAoTests(unittest.TestCase):
@require_torch
@require_torch_gpu
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.7.0")
@slow
@nightly
class SlowTorchAoPreserializedModelTests(unittest.TestCase):
def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_dummy_inputs(self, device: torch.device, seed: int = 0):
if str(device).startswith("mps"):
......
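Beyond cache clearing, the torchao tests above switch from require_torch_gpu to require_torch_accelerator and pass an explicit device_map derived from torch_device when loading the quantized transformer. A hedged sketch of that loading call, using the quant type and tiny test checkpoint referenced in the diff (the surrounding wiring is illustrative):

import torch

from diffusers import FluxTransformer2DModel, TorchAoConfig
from diffusers.utils.testing_utils import torch_device

# int8 weight-only quantization, one of the quant types exercised above.
quantization_config = TorchAoConfig("int8_weight_only")

quantized_model = FluxTransformer2DModel.from_pretrained(
    "hf-internal-testing/tiny-flux-pipe",
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    # Pin to the first device of the active backend ("cuda:0" or "xpu:0")
    # instead of hard-coding CUDA.
    device_map=f"{torch_device}:0",
)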
@@ -16,8 +16,6 @@
import gc
import unittest
import torch
from diffusers import (
Lumina2Transformer2DModel,
)
@@ -66,9 +64,9 @@ class Lumina2Transformer2DModelSingleFileTests(unittest.TestCase):
def test_checkpoint_loading(self):
for ckpt_path in self.alternate_keys_ckpt_paths:
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
model = self.model_class.from_single_file(ckpt_path)
del model
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
@@ -16,8 +16,6 @@
import gc
import unittest
import torch
from diffusers import (
FluxTransformer2DModel,
)
@@ -64,9 +62,9 @@ class FluxTransformer2DModelSingleFileTests(unittest.TestCase):
def test_checkpoint_loading(self):
for ckpt_path in self.alternate_keys_ckpt_paths:
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
model = self.model_class.from_single_file(ckpt_path)
del model
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
import gc
import unittest
import torch
from diffusers import (
SanaTransformer2DModel,
)
@@ -53,9 +51,9 @@ class SanaTransformer2DModelSingleFileTests(unittest.TestCase):
def test_checkpoint_loading(self):
for ckpt_path in self.alternate_keys_ckpt_paths:
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
model = self.model_class.from_single_file(ckpt_path)
del model
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)