Unverified Commit 7855ac59 authored by Fanli Lin, committed by GitHub

[tests] make tests device-agnostic (part 4) (#10508)



* initial commit

* fix empty cache

* fix one more

* fix style

* update device functions

* update

* update

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* with gc.collect

* update

* make style

* check_torch_dependencies

* add mps empty cache

* add changes

* bug fix

* enable on xpu

* update more cases

* revert

* revert back

* Update test_stable_diffusion_xl.py

* Update tests/pipelines/stable_diffusion/test_stable_diffusion.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Apply suggestions from code review
Co-authored-by: hlky <hlky@hlky.ac>

* add test marker

---------

Co-authored-by: hlky <hlky@hlky.ac>
parent 30cef6bf
......@@ -24,11 +24,12 @@ from diffusers import DDPMWuerstchenScheduler, StableCascadeDecoderPipeline
from diffusers.models import StableCascadeUNet
from diffusers.pipelines.wuerstchen import PaellaVQModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_numpy,
load_pt,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
skip_mps,
slow,
torch_device,
......@@ -278,25 +279,25 @@ class StableCascadeDecoderPipelineFastTests(PipelineTesterMixin, unittest.TestCa
@slow
@require_torch_gpu
@require_torch_accelerator
class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_stable_cascade_decoder(self):
pipe = StableCascadeDecoderPipeline.from_pretrained(
"stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
......
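Editor's note: the recurring replacement above swaps hard-coded `torch.cuda.empty_cache()` calls for a backend dispatcher. Below is a minimal sketch of how such a helper can be written, assuming per-backend torch modules (`torch.cuda`, `torch.xpu`, `torch.mps`); the shipped implementation in `src/diffusers/utils/testing_utils.py` may differ in detail.

```python
# Minimal sketch of a device-agnostic cache-clearing helper (assumption:
# the real backend_empty_cache in testing_utils follows a similar dispatch).
import torch


def backend_empty_cache(device: str) -> None:
    """Release cached allocator memory on the active accelerator backend."""
    device_type = torch.device(device).type
    if device_type == "cuda":
        torch.cuda.empty_cache()
    elif device_type == "xpu" and hasattr(torch, "xpu"):
        torch.xpu.empty_cache()
    elif device_type == "mps" and hasattr(torch, "mps"):
        torch.mps.empty_cache()
    # CPU has no device cache, so it is intentionally a no-op.
```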
......@@ -24,11 +24,12 @@ from diffusers import DDPMWuerstchenScheduler, StableCascadePriorPipeline
from diffusers.models import StableCascadeUNet
from diffusers.utils.import_utils import is_peft_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_numpy,
numpy_cosine_similarity_distance,
require_peft_backend,
require_torch_gpu,
require_torch_accelerator,
skip_mps,
slow,
torch_device,
......@@ -246,25 +247,25 @@ class StableCascadePriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase
@slow
@require_torch_gpu
@require_torch_accelerator
class StableCascadePriorPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_stable_cascade_prior(self):
pipe = StableCascadePriorPipeline.from_pretrained(
"stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
......
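Editor's note: these integration classes key everything off the module-level `torch_device`. A plausible sketch of how that value can be resolved once at import time follows; the probe order and any environment overrides are assumptions, as the actual logic lives in `testing_utils`.

```python
# Hypothetical device probe; the real testing_utils may also honor explicit
# environment overrides before falling back to detection.
import torch


def _resolve_torch_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


torch_device = _resolve_torch_device()
```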
......@@ -44,6 +44,10 @@ from diffusers import (
)
from diffusers.utils.testing_utils import (
CaptureLogger,
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
is_torch_compile,
load_image,
......@@ -52,7 +56,7 @@ from diffusers.utils.testing_utils import (
numpy_cosine_similarity_distance,
require_accelerate_version_greater,
require_torch_2,
require_torch_gpu,
require_torch_accelerator,
require_torch_multi_gpu,
run_test_in_subprocess,
skip_mps,
......@@ -781,11 +785,11 @@ class StableDiffusionPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionPipelineSlowTests(unittest.TestCase):
def setUp(self):
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......@@ -887,7 +891,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
assert np.abs(image_slice - expected_slice).max() < 3e-3
def test_stable_diffusion_attention_slicing(self):
torch.cuda.reset_peak_memory_stats()
backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
pipe.unet.set_default_attn_processor()
pipe = pipe.to(torch_device)
......@@ -898,8 +902,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
inputs = self.get_inputs(torch_device, dtype=torch.float16)
image_sliced = pipe(**inputs).images
mem_bytes = torch.cuda.max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
mem_bytes = backend_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
# make sure that less than 3.75 GB is allocated
assert mem_bytes < 3.75 * 10**9
......@@ -910,13 +914,13 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
image = pipe(**inputs).images
# make sure that more than 3.75 GB is allocated
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes > 3.75 * 10**9
max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
assert max_diff < 1e-3
def test_stable_diffusion_vae_slicing(self):
torch.cuda.reset_peak_memory_stats()
backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
......@@ -929,8 +933,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
inputs["latents"] = torch.cat([inputs["latents"]] * 4)
image_sliced = pipe(**inputs).images
mem_bytes = torch.cuda.max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
mem_bytes = backend_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
# make sure that less than 4 GB is allocated
assert mem_bytes < 4e9
......@@ -942,14 +946,14 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
image = pipe(**inputs).images
# make sure that more than 4 GB is allocated
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes > 4e9
# There is a small discrepancy at the image borders vs. a fully batched version.
max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
assert max_diff < 1e-2
def test_stable_diffusion_vae_tiling(self):
torch.cuda.reset_peak_memory_stats()
backend_reset_peak_memory_stats(torch_device)
model_id = "CompVis/stable-diffusion-v1-4"
pipe = StableDiffusionPipeline.from_pretrained(
model_id, variant="fp16", torch_dtype=torch.float16, safety_checker=None
......@@ -963,7 +967,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
# enable vae tiling
pipe.enable_vae_tiling()
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
generator = torch.Generator(device="cpu").manual_seed(0)
output_chunked = pipe(
[prompt],
......@@ -976,7 +980,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
)
image_chunked = output_chunked.images
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# disable vae tiling
pipe.disable_vae_tiling()
......@@ -1069,26 +1073,25 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
assert 2 * low_cpu_mem_usage_time < normal_load_time
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device, dtype=torch.float16)
_ = pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.8 GB is allocated
assert mem_bytes < 2.8 * 10**9
def test_stable_diffusion_pipeline_with_model_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_peak_memory_stats(torch_device)
inputs = self.get_inputs(torch_device, dtype=torch.float16)
......@@ -1102,7 +1105,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
outputs = pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# With model offloading
......@@ -1113,16 +1116,16 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
)
pipe.unet.set_default_attn_processor()
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device, dtype=torch.float16)
outputs_offloaded = pipe(**inputs)
mem_bytes_offloaded = torch.cuda.max_memory_allocated()
mem_bytes_offloaded = backend_max_memory_allocated(torch_device)
images = outputs.images
offloaded_images = outputs_offloaded.images
......@@ -1135,13 +1138,13 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
assert module.device == torch.device("cpu")
# With attention slicing
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe.enable_attention_slicing()
_ = pipe(**inputs)
mem_bytes_slicing = torch.cuda.max_memory_allocated()
mem_bytes_slicing = backend_max_memory_allocated(torch_device)
assert mem_bytes_slicing < mem_bytes_offloaded
assert mem_bytes_slicing < 3 * 10**9
......@@ -1156,7 +1159,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
)
pipe.load_textual_inversion(a111_file)
pipe.load_textual_inversion(a111_file_neg)
pipe.to("cuda")
pipe.to(torch_device)
generator = torch.Generator(device="cpu").manual_seed(1)
......@@ -1173,7 +1176,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self):
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
......@@ -1198,8 +1201,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self):
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe.enable_sequential_cpu_offload()
pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
pipe.enable_sequential_cpu_offload(device=torch_device)
pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons").to(torch_device)
a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
a111_file_neg = hf_hub_download(
......@@ -1257,17 +1260,17 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionPipelineCkptTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_download_from_hub(self):
ckpt_paths = [
......@@ -1278,7 +1281,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
for ckpt_path in ckpt_paths:
pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
pipe.to(torch_device)
image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
......@@ -1294,7 +1297,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
ckpt_filename, config_files={"v1": config_filename}, torch_dtype=torch.float16
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
pipe.to(torch_device)
image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
......@@ -1302,17 +1305,17 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
@nightly
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......@@ -1412,7 +1415,7 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, generator_device="cpu", seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......
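Editor's note: the memory-bound assertions above now route through `backend_max_memory_allocated` and the two reset helpers imported at the top of the file. Below is a sketch of the shape such helpers can take, assuming CUDA-style counters on each backend (XPU mirrors most of them in recent PyTorch); the real implementations may differ.

```python
# Sketch only: backends lacking a counter report 0 / no-op rather than raise.
import torch


def backend_max_memory_allocated(device: str) -> int:
    device_type = torch.device(device).type
    if device_type == "cuda":
        return torch.cuda.max_memory_allocated()
    if device_type == "xpu" and hasattr(torch, "xpu"):
        return torch.xpu.max_memory_allocated()
    return 0  # no peak-memory tracking on this backend


def backend_reset_peak_memory_stats(device: str) -> None:
    device_type = torch.device(device).type
    if device_type == "cuda":
        torch.cuda.reset_peak_memory_stats()
    elif device_type == "xpu" and hasattr(torch, "xpu"):
        torch.xpu.reset_peak_memory_stats()


def backend_reset_max_memory_allocated(device: str) -> None:
    if torch.device(device).type == "cuda":
        torch.cuda.reset_max_memory_allocated()  # CUDA-only legacy counter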
......@@ -35,6 +35,10 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
is_torch_compile,
......@@ -42,7 +46,7 @@ from diffusers.utils.testing_utils import (
load_numpy,
nightly,
require_torch_2,
require_torch_gpu,
require_torch_accelerator,
run_test_in_subprocess,
skip_mps,
slow,
......@@ -400,17 +404,17 @@ class StableDiffusionImg2ImgPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......@@ -513,28 +517,28 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
assert number_of_steps == 2
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device, dtype=torch.float16)
_ = pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.2 GB is allocated
assert mem_bytes < 2.2 * 10**9
def test_stable_diffusion_pipeline_with_model_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
inputs = self.get_inputs(torch_device, dtype=torch.float16)
......@@ -548,7 +552,7 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# With model offloading
......@@ -559,14 +563,14 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
torch_dtype=torch.float16,
)
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
_ = pipe(**inputs)
mem_bytes_offloaded = torch.cuda.max_memory_allocated()
mem_bytes_offloaded = backend_max_memory_allocated(torch_device)
assert mem_bytes_offloaded < mem_bytes
for module in pipe.text_encoder, pipe.unet, pipe.vae:
......@@ -663,17 +667,17 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
@nightly
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......
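Editor's note: every `@require_torch_gpu` marker in these files becomes `@require_torch_accelerator`. A minimal sketch of such a marker, built on `unittest.skipUnless` and the resolved `torch_device` described earlier; the shipped decorator in `testing_utils` may carry extra checks.

```python
# Sketch of an accelerator gate; assumption: torch_device is "cpu" whenever
# no accelerator was detected, so a simple inequality suffices.
import unittest

from diffusers.utils.testing_utils import torch_device


def require_torch_accelerator(test_case):
    """Skip the decorated test unless a non-CPU torch device was detected."""
    return unittest.skipUnless(
        torch_device != "cpu", "test requires a hardware accelerator"
    )(test_case)
```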
......@@ -37,6 +37,10 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
is_torch_compile,
......@@ -44,7 +48,7 @@ from diffusers.utils.testing_utils import (
load_numpy,
nightly,
require_torch_2,
require_torch_gpu,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
......@@ -602,7 +606,7 @@ class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipeli
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
......@@ -610,7 +614,7 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......@@ -704,21 +708,21 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
assert np.abs(expected_slice - image_slice).max() < 6e-3
def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionInpaintPipeline.from_pretrained(
"botp/stable-diffusion-v1-5-inpainting", safety_checker=None, torch_dtype=torch.float16
)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device, dtype=torch.float16)
_ = pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.2 GB is allocated
assert mem_bytes < 2.2 * 10**9
......@@ -793,7 +797,7 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
......@@ -801,7 +805,7 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......@@ -907,9 +911,9 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
assert np.abs(expected_slice - image_slice).max() < 6e-3
def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
vae = AsymmetricAutoencoderKL.from_pretrained(
"cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
......@@ -920,12 +924,12 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
pipe.vae = vae
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device, dtype=torch.float16)
_ = pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.45 GB is allocated
assert mem_bytes < 2.45 * 10**9
......@@ -1009,7 +1013,7 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
pipe.vae = vae
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
pipe.to(torch_device)
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 1
......@@ -1019,17 +1023,17 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
@nightly
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......
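Editor's note: both offload entry points accept an explicit target device, which is why the diff threads `device=torch_device` through every `enable_model_cpu_offload` / `enable_sequential_cpu_offload` call instead of relying on the CUDA default. A usage sketch:

```python
import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils.testing_utils import torch_device

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
)
# Whole-component offload: each component moves to torch_device only while used.
pipe.enable_model_cpu_offload(device=torch_device)
# Alternatively, per-module offload for the smallest memory footprint:
# pipe.enable_sequential_cpu_offload(device=torch_device)
```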
......@@ -33,10 +33,14 @@ from diffusers import (
)
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
load_image,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -266,17 +270,17 @@ class StableDiffusionInstructPix2PixPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, seed=0):
generator = torch.manual_seed(seed)
......@@ -384,21 +388,21 @@ class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
assert number_of_steps == 3
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
"timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
inputs = self.get_inputs()
_ = pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.2 GB is allocated
assert mem_bytes < 2.2 * 10**9
......
......@@ -34,12 +34,13 @@ from diffusers import (
from diffusers.utils.testing_utils import (
CaptureLogger,
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
load_numpy,
nightly,
numpy_cosine_similarity_distance,
require_torch_accelerator,
require_torch_gpu,
skip_mps,
slow,
torch_device,
......@@ -330,9 +331,8 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
_generator_device = "cpu" if not generator_device.startswith("cuda") else "cuda"
if not str(device).startswith("mps"):
generator = torch.Generator(device=_generator_device).manual_seed(seed)
generator = torch.Generator(device=generator_device).manual_seed(seed)
else:
generator = torch.manual_seed(seed)
......@@ -361,9 +361,9 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506])
assert np.abs(image_slice - expected_slice).max() < 7e-3
@require_torch_gpu
@require_torch_accelerator
def test_stable_diffusion_attention_slicing(self):
torch.cuda.reset_peak_memory_stats()
backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16
)
......@@ -376,8 +376,8 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
inputs = self.get_inputs(torch_device, dtype=torch.float16)
image_sliced = pipe(**inputs).images
mem_bytes = torch.cuda.max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
mem_bytes = backend_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
# make sure that less than 3.3 GB is allocated
assert mem_bytes < 3.3 * 10**9
......@@ -388,7 +388,7 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
image = pipe(**inputs).images
# make sure that more than 3.3 GB is allocated
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes > 3.3 * 10**9
max_diff = numpy_cosine_similarity_distance(image.flatten(), image_sliced.flatten())
assert max_diff < 5e-3
......
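Editor's note: the `get_inputs` change above drops the intermediate `_generator_device` remapping and builds the generator directly on the requested device, keeping only the MPS fallback. The pattern, as a small sketch:

```python
# Sketch of the simplified generator selection: MPS tests fall back to the
# global RNG, every other backend gets a generator on generator_device.
import torch


def make_generator(device, generator_device="cpu", seed=0):
    if str(device).startswith("mps"):
        return torch.manual_seed(seed)
    return torch.Generator(device=generator_device).manual_seed(seed)
```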
......@@ -37,6 +37,7 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
......@@ -44,7 +45,7 @@ from diffusers.utils.testing_utils import (
nightly,
require_accelerate_version_greater,
require_accelerator,
require_torch_gpu,
require_torch_accelerator,
skip_mps,
slow,
torch_device,
......@@ -378,17 +379,17 @@ class StableDiffusionDepth2ImgPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=device).manual_seed(seed)
......@@ -425,17 +426,17 @@ class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase):
@nightly
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=device).manual_seed(seed)
......
......@@ -33,12 +33,13 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
nightly,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
torch_device,
)
......@@ -299,18 +300,18 @@ class StableDiffusionDiffEditPipelineFastTests(
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@require_torch_gpu
@require_torch_accelerator
@nightly
class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
@classmethod
def setUpClass(cls):
......@@ -331,7 +332,7 @@ class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
pipe.scheduler.clip_sample = True
pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
source_prompt = "a bowl of fruit"
......@@ -377,17 +378,17 @@ class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
@nightly
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionDiffEditPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
@classmethod
def setUpClass(cls):
......
......@@ -24,11 +24,14 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -161,19 +164,19 @@ class StableDiffusion2InpaintPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_stable_diffusion_inpaint_pipeline(self):
init_image = load_image(
......@@ -248,9 +251,9 @@ class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
assert np.abs(expected_image - image).max() < 5e-1
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
init_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
......@@ -270,7 +273,7 @@ class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
......
......@@ -31,11 +31,12 @@ from diffusers import (
)
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -284,29 +285,29 @@ class StableDiffusionLatentUpscalePipelineFastTests(
pass
@require_torch_gpu
@require_torch_accelerator
@slow
class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_latent_upscaler_fp16(self):
generator = torch.manual_seed(33)
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
pipe.to("cuda")
pipe.to(torch_device)
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
"stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
)
upscaler.to("cuda")
upscaler.to(torch_device)
prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
......@@ -332,7 +333,7 @@ class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
"stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
)
upscaler.to("cuda")
upscaler.to(torch_device)
prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas"
......
......@@ -25,12 +25,16 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
require_accelerator,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -44,13 +48,13 @@ class StableDiffusionUpscalePipelineFastTests(unittest.TestCase):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
@property
def dummy_image(self):
......@@ -381,19 +385,19 @@ class StableDiffusionUpscalePipelineFastTests(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_stable_diffusion_upscale_pipeline(self):
image = load_image(
......@@ -459,9 +463,9 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
assert np.abs(expected_image - image).max() < 5e-1
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
......@@ -475,7 +479,7 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
prompt = "a cat sitting on a park bench"
......@@ -488,6 +492,6 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
output_type="np",
)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.9 GB is allocated
assert mem_bytes < 2.9 * 10**9
......@@ -31,11 +31,15 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
load_numpy,
numpy_cosine_similarity_distance,
require_accelerator,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -49,13 +53,13 @@ class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
@property
def dummy_cond_unet(self):
......@@ -258,19 +262,19 @@ class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_stable_diffusion_v_pred_default(self):
sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
......@@ -357,7 +361,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_attention_slicing_v_pred(self):
torch.cuda.reset_peak_memory_stats()
backend_reset_peak_memory_stats(torch_device)
model_id = "stabilityai/stable-diffusion-2"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe.to(torch_device)
......@@ -373,8 +377,8 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
)
image_chunked = output_chunked.images
mem_bytes = torch.cuda.max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
mem_bytes = backend_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
# make sure that less than 5.5 GB is allocated
assert mem_bytes < 5.5 * 10**9
......@@ -385,7 +389,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
image = output.images
# make sure that more than 3.0 GB is allocated
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes > 3 * 10**9
max_diff = numpy_cosine_similarity_distance(image.flatten(), image_chunked.flatten())
assert max_diff < 1e-3
......@@ -421,7 +425,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
pipe.scheduler = DDIMScheduler.from_config(
pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
......@@ -466,7 +470,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
......@@ -530,20 +534,20 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
assert 2 * low_cpu_mem_usage_time < normal_load_time
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipeline_id = "stabilityai/stable-diffusion-2"
prompt = "Andromeda galaxy in a bottle"
pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
pipeline.enable_attention_slicing(1)
pipeline.enable_sequential_cpu_offload()
pipeline.enable_sequential_cpu_offload(device=torch_device)
generator = torch.manual_seed(0)
_ = pipeline(prompt, generator=generator, num_inference_steps=5)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.8 GB is allocated
assert mem_bytes < 2.8 * 10**9
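Editor's note: the sequential-offload memory checks in this file and the ones above all follow the same reset, run, measure shape, now expressed with the backend helpers. Condensed into one sketch that mirrors the v-prediction test directly above:

```python
import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_max_memory_allocated,
    backend_reset_peak_memory_stats,
    torch_device,
)

backend_empty_cache(torch_device)                  # start from a clean allocator
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)

pipeline = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2", torch_dtype=torch.float16
)
pipeline.enable_attention_slicing(1)
pipeline.enable_sequential_cpu_offload(device=torch_device)

_ = pipeline("Andromeda galaxy in a bottle", num_inference_steps=5)

mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 2.8 * 10**9                     # offloading keeps the peak low
```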
......@@ -8,6 +8,7 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProject
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline
from diffusers.utils.testing_utils import (
backend_empty_cache,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
slow,
......@@ -240,12 +241,12 @@ class StableDiffusion3PipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0):
if str(device).startswith("mps"):
......@@ -263,7 +264,7 @@ class StableDiffusion3PipelineSlowTests(unittest.TestCase):
def test_sd3_inference(self):
pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device)
......
......@@ -15,6 +15,7 @@ from diffusers import (
)
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
floats_tensor,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
......@@ -174,12 +175,12 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0):
init_image = load_image(
......@@ -202,7 +203,7 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
def test_sd3_img2img_inference(self):
pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device)
......
......@@ -35,12 +35,13 @@ from diffusers import (
from diffusers.utils import logging
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -604,17 +605,17 @@ class StableDiffusionMultiAdapterPipelineFastTests(AdapterTests, PipelineTesterM
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionAdapterPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_stable_diffusion_adapter_depth_sd_v15(self):
adapter_model = "TencentARC/t2iadapter_depth_sd15v2"
......
......@@ -30,13 +30,17 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
nightly,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -164,17 +168,17 @@ class StableDiffusionImageVariationPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......@@ -258,37 +262,37 @@ class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
assert number_of_steps == inputs["num_inference_steps"]
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionImageVariationPipeline.from_pretrained(
"lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16
)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device, dtype=torch.float16)
_ = pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.6 GB is allocated
assert mem_bytes < 2.6 * 10**9
@nightly
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
......
......@@ -38,7 +38,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism,
load_image,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -265,7 +265,7 @@ class StableDiffusionXLPipelineFastTests(
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
@require_torch_gpu
@require_torch_accelerator
def test_stable_diffusion_xl_offloads(self):
pipes = []
components = self.get_dummy_components()
......@@ -274,12 +274,12 @@ class StableDiffusionXLPipelineFastTests(
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
......
......@@ -42,7 +42,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
load_image,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
......@@ -293,7 +293,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests(
assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4)
@require_torch_gpu
@require_torch_accelerator
def test_stable_diffusion_xl_offloads(self):
pipes = []
components = self.get_dummy_components()
......@@ -302,12 +302,12 @@ class StableDiffusionXLImg2ImgPipelineFastTests(
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
......@@ -596,7 +596,7 @@ class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests(
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@require_torch_gpu
@require_torch_accelerator
def test_stable_diffusion_xl_offloads(self):
pipes = []
components = self.get_dummy_components()
......@@ -605,12 +605,12 @@ class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests(
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
......
......@@ -41,7 +41,13 @@ from diffusers import (
UNet2DConditionModel,
UniPCMultistepScheduler,
)
from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, slow, torch_device
from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
require_torch_accelerator,
slow,
torch_device,
)
from ..pipeline_params import (
TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
......@@ -305,7 +311,48 @@ class StableDiffusionXLInpaintPipelineFastTests(
def test_save_load_optional_components(self):
pass
@require_torch_gpu
@require_torch_accelerator
def test_stable_diffusion_xl_inpaint_negative_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLInpaintPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
@require_torch_accelerator
def test_stable_diffusion_xl_offloads(self):
pipes = []
components = self.get_dummy_components()
......@@ -314,12 +361,12 @@ class StableDiffusionXLInpaintPipelineFastTests(
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLInpaintPipeline(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLInpaintPipeline(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
......
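Editor's note, design aside: once more than a handful of these `backend_*` functions exist, the per-function if/elif chains can be collapsed into dispatch tables, one per operation, so that adding a new backend touches a single mapping. A sketch of that variant (an assumption about structure, not the shipped code):

```python
# Hypothetical table-driven dispatch; keys are torch device types.
import torch

BACKEND_EMPTY_CACHE = {
    "cuda": torch.cuda.empty_cache,
    "cpu": None,  # nothing to clear on CPU
}
if hasattr(torch, "xpu"):
    BACKEND_EMPTY_CACHE["xpu"] = torch.xpu.empty_cache
if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
    BACKEND_EMPTY_CACHE["mps"] = torch.mps.empty_cache


def backend_empty_cache(device: str) -> None:
    fn = BACKEND_EMPTY_CACHE.get(torch.device(device).type)
    if fn is not None:
        fn()
```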