Unverified commit 7855ac59, authored by Fanli Lin, committed by GitHub

[tests] make tests device-agnostic (part 4) (#10508)



* initial commit

* fix empty cache

* fix one more

* fix style

* update device functions

* update

* update

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* with gc.collect

* update

* make style

* check_torch_dependencies

* add mps empty cache

* add changes

* bug fix

* enable on xpu

* update more cases

* revert

* revert back

* Update test_stable_diffusion_xl.py

* Update tests/pipelines/stable_diffusion/test_stable_diffusion.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Apply suggestions from code review
Co-authored-by: hlky <hlky@hlky.ac>

* add test marker

---------
Co-authored-by: hlky <hlky@hlky.ac>
parent 30cef6bf
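The hunks below repeatedly replace direct `torch.cuda.*` calls with `backend_*` helpers imported from `diffusers.utils.testing_utils`, so the same test can run on CUDA, XPU, or MPS hosts. A minimal sketch of how such a helper can dispatch, assuming `torch_device` resolves to a device-type string such as "cuda", "xpu", or "mps" (the body is illustrative; the real implementation in src/diffusers/utils/testing_utils.py may differ in detail):

```python
# Illustrative sketch, not a copy of the real helper.
import torch

def backend_empty_cache(device: str) -> None:
    """Free cached allocator memory on whichever accelerator backs `device`."""
    if device.startswith("cuda"):
        torch.cuda.empty_cache()
    elif device.startswith("xpu"):
        torch.xpu.empty_cache()
    elif device.startswith("mps"):
        torch.mps.empty_cache()
    # "cpu" has no allocator cache to clear, so it falls through as a no-op
```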
@@ -33,11 +33,12 @@ from diffusers import (
)
from diffusers.utils.import_utils import is_accelerate_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
load_image,
nightly,
numpy_cosine_similarity_distance,
require_peft_backend,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -101,7 +102,7 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
# Keeping this test here makes sense because it doesn't look like an integration test
# (value assertions on logits).
@slow
@require_torch_gpu
@require_torch_accelerator
def test_integration_move_lora_cpu(self):
path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
lora_id = "takuma104/lora-test-text-encoder-lora-target"
@@ -158,7 +159,7 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
self.assertTrue(m.weight.device != torch.device("cpu"))
@slow
@require_torch_gpu
@require_torch_accelerator
def test_integration_move_lora_dora_cpu(self):
from peft import LoraConfig
@@ -209,18 +210,18 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
@slow
@nightly
@require_torch_gpu
@require_torch_accelerator
@require_peft_backend
class LoraIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_integration_logits_with_scale(self):
path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
@@ -378,7 +379,7 @@ class LoraIntegrationTests(unittest.TestCase):
generator = torch.Generator().manual_seed(0)
pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
lora_filename = "light_and_shadow.safetensors"
pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
@@ -400,7 +401,7 @@ class LoraIntegrationTests(unittest.TestCase):
generator = torch.Generator().manual_seed(0)
pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
lora_filename = "light_and_shadow.safetensors"
pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
@@ -656,7 +657,7 @@ class LoraIntegrationTests(unittest.TestCase):
See: https://github.com/huggingface/diffusers/issues/5606
"""
pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
pipeline.enable_sequential_cpu_offload()
pipeline.enable_sequential_cpu_offload(device=torch_device)
civitai_path = hf_hub_download("ybelkada/test-ahi-civitai", "ahi_lora_weights.safetensors")
pipeline.load_lora_weights(civitai_path, adapter_name="ahri")
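The offload changes in the file above follow the same principle: `enable_model_cpu_offload` and `enable_sequential_cpu_offload` default to CUDA, so the tests now name the accelerator explicitly. Usage, as assembled from the hunks above:

```python
from diffusers import StableDiffusionPipeline
from diffusers.utils.testing_utils import torch_device

pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
# previously: pipe.enable_model_cpu_offload()  -- implicitly targeted CUDA
pipe.enable_model_cpu_offload(device=torch_device)
```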
@@ -30,12 +30,13 @@ from diffusers import (
from diffusers.utils import load_image
from diffusers.utils.import_utils import is_accelerate_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
is_flaky,
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_peft_backend,
require_torch_gpu,
require_torch_accelerator,
torch_device,
)
@@ -93,7 +94,7 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
def output_shape(self):
return (1, 32, 32, 3)
@require_torch_gpu
@require_torch_accelerator
def test_sd3_lora(self):
"""
Test loading the loras that are saved with the diffusers and peft formats.
@@ -135,7 +136,7 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
@nightly
@require_torch_gpu
@require_torch_accelerator
@require_peft_backend
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda
@@ -146,12 +147,12 @@ class SD3LoraIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0):
init_image = load_image(
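The decorator swap above (`require_torch_gpu` to `require_torch_accelerator`) widens the gate from CUDA-only to any supported torch accelerator. A hedged sketch of the distinction, assuming the decorator simply checks the resolved `torch_device` (the actual check in `testing_utils` may be more involved):

```python
import unittest

from diffusers.utils.testing_utils import torch_device

def require_torch_accelerator(test_case):
    # Skip unless some accelerator (CUDA, XPU, MPS, ...) backs torch_device.
    return unittest.skipUnless(torch_device != "cpu", "test requires an accelerator")(test_case)
```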
@@ -36,6 +36,9 @@ from diffusers.utils import logging
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
is_peft_available,
@@ -1002,7 +1005,7 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
assert loaded_model
assert new_output.sample.shape == (4, 4, 16, 16)
@require_torch_gpu
@require_torch_accelerator
def test_load_sharded_checkpoint_from_hub_local(self):
_, inputs_dict = self.prepare_init_args_and_inputs_for_common()
ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
@@ -1013,7 +1016,7 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
assert loaded_model
assert new_output.sample.shape == (4, 4, 16, 16)
@require_torch_gpu
@require_torch_accelerator
def test_load_sharded_checkpoint_from_hub_local_subfolder(self):
_, inputs_dict = self.prepare_init_args_and_inputs_for_common()
ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
@@ -1024,7 +1027,7 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
assert loaded_model
assert new_output.sample.shape == (4, 4, 16, 16)
@require_torch_gpu
@require_torch_accelerator
@parameterized.expand(
[
("hf-internal-testing/unet2d-sharded-dummy", None),
@@ -1039,7 +1042,7 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
assert loaded_model
assert new_output.sample.shape == (4, 4, 16, 16)
@require_torch_gpu
@require_torch_accelerator
@parameterized.expand(
[
("hf-internal-testing/unet2d-sharded-dummy-subfolder", None),
@@ -1054,7 +1057,7 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
assert loaded_model
assert new_output.sample.shape == (4, 4, 16, 16)
@require_torch_gpu
@require_torch_accelerator
def test_load_sharded_checkpoint_device_map_from_hub_local(self):
_, inputs_dict = self.prepare_init_args_and_inputs_for_common()
ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
@@ -1064,7 +1067,7 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
assert loaded_model
assert new_output.sample.shape == (4, 4, 16, 16)
@require_torch_gpu
@require_torch_accelerator
def test_load_sharded_checkpoint_device_map_from_hub_local_subfolder(self):
_, inputs_dict = self.prepare_init_args_and_inputs_for_common()
ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
@@ -1164,11 +1167,11 @@ class UNet2DConditionModelIntegrationTests(unittest.TestCase):
return model
@require_torch_gpu
@require_torch_accelerator
def test_set_attention_slice_auto(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
unet = self.get_unet_model()
unet.set_attention_slice("auto")
@@ -1180,15 +1183,15 @@ class UNet2DConditionModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
_ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 5 * 10**9
@require_torch_gpu
@require_torch_accelerator
def test_set_attention_slice_max(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
unet = self.get_unet_model()
unet.set_attention_slice("max")
@@ -1200,15 +1203,15 @@ class UNet2DConditionModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
_ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 5 * 10**9
@require_torch_gpu
@require_torch_accelerator
def test_set_attention_slice_int(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
unet = self.get_unet_model()
unet.set_attention_slice(2)
@@ -1220,15 +1223,15 @@ class UNet2DConditionModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
_ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 5 * 10**9
@require_torch_gpu
@require_torch_accelerator
def test_set_attention_slice_list(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
# there are 32 sliceable layers
slice_list = 16 * [2, 3]
@@ -1242,7 +1245,7 @@ class UNet2DConditionModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
_ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 5 * 10**9
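The memory assertions above get the same treatment: peak-memory bookkeeping goes through backend helpers rather than `torch.cuda` directly. A sketch of what two of the counterparts imported above might look like (the names match the imports; the bodies are assumptions, and the real helpers in `testing_utils` may differ):

```python
import torch

def backend_reset_peak_memory_stats(device: str) -> None:
    # Reset the peak-memory counters on the accelerator backing `device`.
    if device.startswith("cuda"):
        torch.cuda.reset_peak_memory_stats()
    elif device.startswith("xpu"):
        torch.xpu.reset_peak_memory_stats()

def backend_max_memory_allocated(device: str) -> int:
    # Report the peak allocated bytes since the last reset.
    if device.startswith("cuda"):
        return torch.cuda.max_memory_allocated()
    if device.startswith("xpu"):
        return torch.xpu.max_memory_allocated()
    return 0  # assumption: backends without this counter report zero
```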
@@ -79,7 +79,7 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.to("cuda")
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
@@ -40,7 +40,7 @@ from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
require_torch_gpu,
require_torch_accelerator,
torch_device,
)
@@ -245,7 +245,7 @@ class ControlNetPipelineSDXLFastTests(
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=2e-3)
@require_torch_gpu
@require_torch_accelerator
def test_stable_diffusion_xl_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -254,12 +254,12 @@ class ControlNetPipelineSDXLFastTests(
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -223,12 +223,12 @@ class StableDiffusionXLControlNetPipelineFastTests(
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -31,6 +31,7 @@ from diffusers import (
from diffusers.models import FluxControlNetModel
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
numpy_cosine_similarity_distance,
@@ -217,12 +218,12 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = FluxControlNetModel.from_pretrained(
@@ -239,7 +239,7 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -9,6 +9,7 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPToken
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
@@ -212,12 +213,12 @@ class FluxPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0):
generator = torch.Generator(device="cpu").manual_seed(seed)
@@ -34,11 +34,12 @@ from diffusers import (
from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
is_flaky,
load_pt,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -54,13 +55,13 @@ class IPAdapterNightlyTestsMixin(unittest.TestCase):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_image_encoder(self, repo_id, subfolder):
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
@@ -165,7 +166,7 @@ class IPAdapterNightlyTestsMixin(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin):
def test_text_to_image(self):
image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
@@ -280,7 +281,7 @@ class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin):
inputs = self.get_dummy_inputs()
output_without_offload = pipeline(**inputs).images
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
inputs = self.get_dummy_inputs()
output_with_offload = pipeline(**inputs).images
max_diff = np.abs(output_with_offload - output_without_offload).max()
@@ -391,7 +392,7 @@ class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin):
@slow
@require_torch_gpu
@require_torch_accelerator
class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
def test_text_to_image_sdxl(self):
image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder")
@@ -403,7 +404,7 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
feature_extractor=feature_extractor,
torch_dtype=self.dtype,
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
inputs = self.get_dummy_inputs()
@@ -461,7 +462,7 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
feature_extractor=feature_extractor,
torch_dtype=self.dtype,
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
inputs = self.get_dummy_inputs(for_image_to_image=True)
@@ -530,7 +531,7 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
feature_extractor=feature_extractor,
torch_dtype=self.dtype,
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
inputs = self.get_dummy_inputs(for_inpainting=True)
@@ -578,7 +579,7 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
image_encoder=image_encoder,
torch_dtype=self.dtype,
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.load_ip_adapter(
"h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus-face_sdxl_vit-h.safetensors"
)
@@ -606,7 +607,7 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
image_encoder=image_encoder,
torch_dtype=self.dtype,
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.load_ip_adapter(
"h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2
)
@@ -633,7 +634,7 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
pipeline = StableDiffusionXLPipeline.from_pretrained(
"RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, image_encoder=image_encoder, variant="fp16"
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.load_ip_adapter(
["ostris/ip-composition-adapter", "h94/IP-Adapter"],
@@ -674,7 +675,7 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
image_encoder=image_encoder,
torch_dtype=self.dtype,
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.load_ip_adapter(
"h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"]
)
@@ -24,10 +24,11 @@ from transformers import XLMRobertaTokenizerFast
from diffusers import DDIMScheduler, KandinskyPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel
from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_numpy,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -246,7 +247,7 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -255,12 +256,12 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -275,19 +276,19 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class KandinskyPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinsky_text2img(self):
expected_image = load_numpy(
@@ -306,7 +307,7 @@ class KandinskyPipelineIntegrationTests(unittest.TestCase):
prompt = "red cat, 4k photo"
generator = torch.Generator(device="cuda").manual_seed(0)
generator = torch.Generator(device=torch_device).manual_seed(0)
image_emb, zero_image_emb = pipe_prior(
prompt,
generator=generator,
@@ -314,7 +315,7 @@ class KandinskyPipelineIntegrationTests(unittest.TestCase):
negative_prompt="",
).to_tuple()
generator = torch.Generator(device="cuda").manual_seed(0)
generator = torch.Generator(device=torch_device).manual_seed(0)
output = pipeline(
prompt,
image_embeds=image_emb,
@@ -18,7 +18,7 @@ import unittest
import numpy as np
from diffusers import KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyInpaintCombinedPipeline
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device
from ..test_pipelines_common import PipelineTesterMixin
from .test_kandinsky import Dummies
@@ -105,7 +105,7 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -114,12 +114,12 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -213,7 +213,7 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -222,12 +222,12 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -325,7 +325,7 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -334,12 +334,12 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -32,12 +32,13 @@ from diffusers import (
)
from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
nightly,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -267,7 +268,7 @@ class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -299,19 +300,19 @@ class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinsky_img2img(self):
expected_image = load_numpy(
@@ -365,19 +366,19 @@ class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase):
@nightly
@require_torch_gpu
@require_torch_accelerator
class KandinskyImg2ImgPipelineNightlyTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinsky_img2img_ddpm(self):
expected_image = load_numpy(
@@ -25,12 +25,13 @@ from transformers import XLMRobertaTokenizerFast
from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel
from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
nightly,
require_torch_gpu,
require_torch_accelerator,
torch_device,
)
@@ -265,7 +266,7 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -274,12 +275,12 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -297,19 +298,19 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@nightly
@require_torch_gpu
@require_torch_accelerator
class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinsky_inpaint(self):
expected_image = load_numpy(
@@ -22,12 +22,14 @@ import torch
from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_numpy,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
from ..test_pipelines_common import PipelineTesterMixin
@@ -221,19 +223,19 @@ class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class KandinskyV22PipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinsky_text2img(self):
expected_image = load_numpy(
@@ -244,12 +246,12 @@ class KandinskyV22PipelineIntegrationTests(unittest.TestCase):
pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.enable_model_cpu_offload()
pipe_prior.enable_model_cpu_offload(device=torch_device)
pipeline = KandinskyV22Pipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.set_progress_bar_config(disable=None)
prompt = "red cat, 4k photo"
@@ -22,7 +22,7 @@ from diffusers import (
KandinskyV22Img2ImgCombinedPipeline,
KandinskyV22InpaintCombinedPipeline,
)
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device
from ..test_pipelines_common import PipelineTesterMixin
from .test_kandinsky import Dummies
@@ -110,7 +110,7 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -119,12 +119,12 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -234,7 +234,7 @@ class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -243,12 +243,12 @@ class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -357,7 +357,7 @@ class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
@require_torch_gpu
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -366,12 +366,12 @@ class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -29,13 +29,15 @@ from diffusers import (
VQModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
from ..test_pipelines_common import PipelineTesterMixin
@@ -238,19 +240,19 @@ class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
@slow
@require_torch_gpu
@require_torch_accelerator
class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinsky_img2img(self):
expected_image = load_numpy(
@@ -266,12 +268,12 @@ class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase):
pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.enable_model_cpu_offload()
pipe_prior.enable_model_cpu_offload(device=torch_device)
pipeline = KandinskyV22Img2ImgPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -29,13 +29,14 @@ from diffusers import (
VQModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
is_flaky,
load_image,
load_numpy,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -292,19 +293,19 @@ class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCas
@slow
@require_torch_gpu
@require_torch_accelerator
class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinsky_inpaint(self):
expected_image = load_numpy(
@@ -31,10 +31,12 @@ from diffusers import (
from diffusers.image_processor import VaeImageProcessor
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_image,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
from ..pipeline_params import (
@@ -167,25 +169,25 @@ class Kandinsky3PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class Kandinsky3PipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinskyV3(self):
pipe = AutoPipelineForText2Image.from_pretrained(
"kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
@@ -211,7 +213,7 @@ class Kandinsky3PipelineIntegrationTests(unittest.TestCase):
pipe = AutoPipelineForImage2Image.from_pretrained(
"kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -31,10 +31,11 @@ from diffusers import (
from diffusers.image_processor import VaeImageProcessor
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -192,25 +193,25 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
@slow
@require_torch_gpu
@require_torch_accelerator
class Kandinsky3Img2ImgPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_kandinskyV3_img2img(self):
pipe = AutoPipelineForImage2Image.from_pretrained(
"kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
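Taken together, the per-file edits converge on a single device-agnostic pattern for integration test classes; this is how it reads when assembled from the hunks above (the class name here is hypothetical):

```python
import gc
import unittest

from diffusers.utils.testing_utils import (
    backend_empty_cache,
    require_torch_accelerator,
    slow,
    torch_device,
)

@slow
@require_torch_accelerator
class ExamplePipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up accelerator memory before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)  # was: torch.cuda.empty_cache()

    def tearDown(self):
        # clean up accelerator memory after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)
```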