Unverified Commit 7855ac59 authored by Fanli Lin, committed by GitHub

[tests] make tests device-agnostic (part 4) (#10508)



* initial commit

* fix empty cache

* fix one more

* fix style

* update device functions

* update

* update

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* with gc.collect

* update

* make style

* check_torch_dependencies

* add mps empty cache

* add changes

* bug fix

* enable on xpu

* update more cases

* revert

* revert back

* Update test_stable_diffusion_xl.py

* Update tests/pipelines/stable_diffusion/test_stable_diffusion.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Apply suggestions from code review
Co-authored-by: hlky <hlky@hlky.ac>

* add test marker

---------
Co-authored-by: hlky <hlky@hlky.ac>
parent 30cef6bf
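
The recurring change in every file below swaps the CUDA-only helpers (`require_torch_gpu`, `torch.cuda.empty_cache()`) for device-agnostic ones (`require_torch_accelerator`, `backend_empty_cache(torch_device)`). The diff does not show the helper itself; what follows is a minimal sketch, assuming the usual per-backend cache APIs, of what such a dispatch has to do — not the actual diffusers implementation, which lives in src/diffusers/utils/testing_utils.py:

import torch

def backend_empty_cache(device: str) -> None:
    # Clear the allocator cache of whichever backend owns the current
    # accelerator, instead of hard-coding torch.cuda.empty_cache().
    device_type = torch.device(device).type
    if device_type == "cuda":
        torch.cuda.empty_cache()
    elif device_type == "xpu" and hasattr(torch, "xpu"):
        torch.xpu.empty_cache()
    elif device_type == "mps" and hasattr(torch, "mps"):
        torch.mps.empty_cache()
    # "cpu" has no device-side cache, so it is a no-op.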
@@ -13,8 +13,9 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -222,11 +223,11 @@ class LatentConsistencyModelPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LatentConsistencyModelPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
...
@@ -14,10 +14,11 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -229,11 +230,11 @@ class LatentConsistencyModelImg2ImgPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LatentConsistencyModelImg2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
...
@@ -30,9 +30,10 @@ from diffusers import (
 )
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -218,25 +219,25 @@ class LattePipelineFastTests(PipelineTesterMixin, PyramidAttentionBroadcastTeste
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LattePipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_latte(self):
         generator = torch.Generator("cpu").manual_seed(0)

         pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         videos = pipe(
...
@@ -29,10 +29,11 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -202,17 +203,17 @@ class LEditsPPPipelineStableDiffusionFastTests(unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LEditsPPPipelineStableDiffusionSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     @classmethod
     def setUpClass(cls):
...
@@ -41,7 +41,7 @@ from diffusers.utils.testing_utils import (
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -253,7 +253,7 @@ class LEditsPPPipelineStableDiffusionXLFastTests(unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LEditsPPPipelineStableDiffusionXLSlowTests(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
...
@@ -7,8 +7,9 @@ from transformers import AutoTokenizer, GemmaConfig, GemmaForCausalLM
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, LuminaNextDiT2DModel, LuminaText2ImgPipeline
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -100,7 +101,7 @@ class LuminaText2ImgPipelinePipelineFastTests(unittest.TestCase, PipelineTesterM
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LuminaText2ImgPipelineSlowTests(unittest.TestCase):
     pipeline_class = LuminaText2ImgPipeline
     repo_id = "Alpha-VLLM/Lumina-Next-SFT-diffusers"
@@ -108,12 +109,12 @@ class LuminaText2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
@@ -131,7 +132,7 @@ class LuminaText2ImgPipelineSlowTests(unittest.TestCase):
     def test_lumina_inference(self):
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.bfloat16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device)
...
@@ -32,12 +32,14 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     is_flaky,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
+    torch_device,
 )
 from ..test_pipelines_common import PipelineTesterMixin
@@ -288,17 +290,17 @@ class MarigoldDepthPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def _test_marigold_depth(
         self,
@@ -317,8 +319,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             from_pretrained_kwargs["torch_dtype"] = torch.float16

         pipe = MarigoldDepthPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
-        if device == "cuda":
-            pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device=device).manual_seed(generator_seed)
@@ -358,7 +359,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=False,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1244, 0.1265, 0.1292, 0.1240, 0.1252, 0.1266, 0.1246, 0.1226, 0.1180]),
             num_inference_steps=1,
@@ -371,7 +372,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1241, 0.1262, 0.1290, 0.1238, 0.1250, 0.1265, 0.1244, 0.1225, 0.1179]),
             num_inference_steps=1,
@@ -384,7 +385,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=2024,
             expected_slice=np.array([0.1710, 0.1725, 0.1738, 0.1700, 0.1700, 0.1696, 0.1698, 0.1663, 0.1592]),
             num_inference_steps=1,
@@ -397,7 +398,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]),
             num_inference_steps=2,
@@ -410,7 +411,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.2683, 0.2693, 0.2698, 0.2666, 0.2632, 0.2615, 0.2656, 0.2603, 0.2573]),
             num_inference_steps=1,
@@ -423,7 +424,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1200, 0.1215, 0.1237, 0.1193, 0.1197, 0.1202, 0.1196, 0.1166, 0.1109]),
             num_inference_steps=1,
@@ -437,7 +438,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1121, 0.1135, 0.1155, 0.1111, 0.1115, 0.1118, 0.1111, 0.1079, 0.1019]),
             num_inference_steps=1,
@@ -451,7 +452,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.2671, 0.2690, 0.2720, 0.2659, 0.2676, 0.2739, 0.2664, 0.2686, 0.2573]),
             num_inference_steps=1,
...
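
The Marigold hunks above also drop the `if device == "cuda":` guard around CPU offload: `enable_model_cpu_offload` accepts a `device` argument, so the resolved accelerator is passed explicitly and XPU or MPS runs take the same path as CUDA. A minimal sketch of that calling pattern (the tiny checkpoint id here is illustrative, not from this diff):

import torch
from diffusers import DiffusionPipeline
from diffusers.utils.testing_utils import torch_device

# Illustrative tiny checkpoint; any pipeline works the same way.
pipe = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
# Offloading needs an accelerator to offload from, so skip on CPU-only machines.
if torch_device != "cpu":
    pipe.enable_model_cpu_offload(device=torch_device)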
@@ -32,11 +32,13 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
+    torch_device,
 )
 from ..test_pipelines_common import PipelineTesterMixin
@@ -285,17 +287,17 @@ class MarigoldNormalsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def _test_marigold_normals(
         self,
@@ -314,8 +316,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
             from_pretrained_kwargs["torch_dtype"] = torch.float16

         pipe = MarigoldNormalsPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
-        if device == "cuda":
-            pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device=device).manual_seed(generator_seed)
@@ -342,7 +343,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=False,
-            device="cpu",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971]),
             num_inference_steps=1,
@@ -355,7 +356,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=False,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7980, 0.7952, 0.7914, 0.7931, 0.7871, 0.7816, 0.7844, 0.7710, 0.7601]),
             num_inference_steps=1,
@@ -368,7 +369,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7979, 0.7949, 0.7915, 0.7930, 0.7871, 0.7817, 0.7842, 0.7710, 0.7603]),
             num_inference_steps=1,
@@ -381,7 +382,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=2024,
             expected_slice=np.array([0.8428, 0.8428, 0.8433, 0.8369, 0.8325, 0.8315, 0.8271, 0.8135, 0.8057]),
             num_inference_steps=1,
@@ -394,7 +395,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7095, 0.7095, 0.7104, 0.7070, 0.7051, 0.7061, 0.7017, 0.6938, 0.6914]),
             num_inference_steps=2,
@@ -407,7 +408,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7168, 0.7163, 0.7163, 0.7080, 0.7061, 0.7046, 0.7031, 0.7007, 0.6987]),
             num_inference_steps=1,
@@ -420,7 +421,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7114, 0.7124, 0.7144, 0.7085, 0.7070, 0.7080, 0.7051, 0.6958, 0.6924]),
             num_inference_steps=1,
@@ -434,7 +435,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7412, 0.7441, 0.7490, 0.7383, 0.7388, 0.7437, 0.7329, 0.7271, 0.7300]),
             num_inference_steps=1,
@@ -448,7 +449,7 @@ class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7188, 0.7144, 0.7134, 0.7178, 0.7207, 0.7222, 0.7231, 0.7041, 0.6987]),
             num_inference_steps=1,
...
@@ -23,6 +23,7 @@ from transformers import AutoTokenizer, T5EncoderModel
 from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     nightly,
     numpy_cosine_similarity_distance,
@@ -274,18 +275,18 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_mochi(self):
         generator = torch.Generator("cpu").manual_seed(0)

         pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         videos = pipe(
...
@@ -30,8 +30,9 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -285,7 +286,7 @@ class StableDiffusionPAGPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusionPAGPipeline
     repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
@@ -293,12 +294,12 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", seed=1, guidance_scale=7.0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -315,7 +316,7 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     def test_pag_cfg(self):
         pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device)
@@ -333,7 +334,7 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     def test_pag_uncond(self):
         pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
...
@@ -16,10 +16,11 @@ from diffusers import (
     StableDiffusion3PAGImg2ImgPipeline,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -193,7 +194,7 @@ class StableDiffusion3PAGImg2ImgPipelineFastTests(unittest.TestCase, PipelineTes
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusion3PAGImg2ImgPipeline
     repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
@@ -201,12 +202,12 @@ class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(
         self, device, generator_device="cpu", dtype=torch.float32, seed=0, guidance_scale=7.0, pag_scale=0.7
@@ -233,7 +234,7 @@ class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
         pipeline = AutoPipelineForImage2Image.from_pretrained(
             self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.17"]
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device)
...
@@ -32,10 +32,11 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -219,7 +220,7 @@ class StableDiffusionPAGImg2ImgPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusionPAGImg2ImgPipeline
     repo_id = "Jiali/stable-diffusion-1.5"
@@ -227,12 +228,12 @@ class StableDiffusionPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -254,7 +255,7 @@ class StableDiffusionPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def test_pag_cfg(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device)
@@ -272,7 +273,7 @@ class StableDiffusionPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def test_pag_uncond(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
...
@@ -30,10 +30,11 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -251,7 +252,7 @@ class StableDiffusionPAGInpaintPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusionPAGInpaintPipeline
     repo_id = "runwayml/stable-diffusion-v1-5"
@@ -259,12 +260,12 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
@@ -289,7 +290,7 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     def test_pag_cfg(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device)
@@ -307,7 +308,7 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     def test_pag_uncond(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
...
@@ -30,8 +30,9 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -289,7 +290,7 @@ class StableDiffusionXLPAGPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusionXLPAGPipeline
     repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -297,12 +298,12 @@ class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -319,7 +320,7 @@ class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase):
     def test_pag_cfg(self):
         pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device)
@@ -336,7 +337,7 @@ class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase):
     def test_pag_uncond(self):
         pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
...
@@ -39,10 +39,11 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -268,19 +269,19 @@ class StableDiffusionXLPAGImg2ImgPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     repo_id = "stabilityai/stable-diffusion-xl-base-1.0"

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         img_url = (
@@ -304,7 +305,7 @@ class StableDiffusionXLPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def test_pag_cfg(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device)
@@ -321,7 +322,7 @@ class StableDiffusionXLPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def test_pag_uncond(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
...
@@ -40,10 +40,11 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -273,19 +274,19 @@ class StableDiffusionXLPAGInpaintPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLPAGInpaintPipelineIntegrationTests(unittest.TestCase):
     repo_id = "stabilityai/stable-diffusion-xl-base-1.0"

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
@@ -310,7 +311,7 @@ class StableDiffusionXLPAGInpaintPipelineIntegrationTests(unittest.TestCase):
     def test_pag_cfg(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device)
@@ -327,7 +328,7 @@ class StableDiffusionXLPAGInpaintPipelineIntegrationTests(unittest.TestCase):
     def test_pag_uncond(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
...
@@ -28,9 +28,10 @@ from diffusers import (
     PixArtTransformer2DModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -254,7 +255,7 @@ class PixArtAlphaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):
     ckpt_id_1024 = "PixArt-alpha/PixArt-XL-2-1024-MS"
     ckpt_id_512 = "PixArt-alpha/PixArt-XL-2-512x512"
@@ -263,18 +264,18 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_pixart_1024(self):
         generator = torch.Generator("cpu").manual_seed(0)

         pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
@@ -289,7 +290,7 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):
         generator = torch.Generator("cpu").manual_seed(0)

         pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
@@ -305,7 +306,7 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):
         generator = torch.manual_seed(0)

         pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         height, width = 1024, 768
@@ -339,7 +340,7 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):
         generator = torch.manual_seed(0)

         pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         height, width = 512, 768
...
@@ -28,9 +28,10 @@ from diffusers import (
     PixArtTransformer2DModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -283,7 +284,7 @@ class PixArtSigmaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
     ckpt_id_1024 = "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS"
     ckpt_id_512 = "PixArt-alpha/PixArt-Sigma-XL-2-512-MS"
@@ -292,18 +293,18 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_pixart_1024(self):
         generator = torch.Generator("cpu").manual_seed(0)

         pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
@@ -323,7 +324,7 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
         pipe = PixArtSigmaPipeline.from_pretrained(
             self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
@@ -339,7 +340,7 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
         generator = torch.manual_seed(0)

         pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         height, width = 1024, 768
@@ -378,7 +379,7 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
         pipe = PixArtSigmaPipeline.from_pretrained(
             self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         height, width = 512, 768
...
@@ -22,8 +22,9 @@ from transformers import Gemma2Config, Gemma2Model, GemmaTokenizer
 from diffusers import AutoencoderDC, FlowMatchEulerDiscreteScheduler, SanaPipeline, SanaTransformer2DModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -305,19 +306,19 @@ class SanaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class SanaPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_sana_1024(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -325,7 +326,7 @@ class SanaPipelineIntegrationTests(unittest.TestCase):
         pipe = SanaPipeline.from_pretrained(
             "Efficient-Large-Model/Sana_1600M_1024px_diffusers", torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         image = pipe(
             prompt=self.prompt,
@@ -351,7 +352,7 @@ class SanaPipelineIntegrationTests(unittest.TestCase):
         pipe = SanaPipeline.from_pretrained(
             "Efficient-Large-Model/Sana_1600M_512px_diffusers", torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         image = pipe(
             prompt=self.prompt,
...
@@ -22,7 +22,7 @@ from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokeni
 from diffusers import DDPMWuerstchenScheduler, StableCascadeCombinedPipeline
 from diffusers.models import StableCascadeUNet
 from diffusers.pipelines.wuerstchen import PaellaVQModel
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device
 from ..test_pipelines_common import PipelineTesterMixin
@@ -205,7 +205,7 @@ class StableCascadeCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestC
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -214,12 +214,12 @@ class StableCascadeCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestC
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
...
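
Taken together, every integration test class in this diff converges on the same device-agnostic skeleton. A condensed sketch — the class name and assertion body are illustrative, while the decorators and helpers are the real ones imported throughout the diff above:

import gc
import unittest

import torch

from diffusers.utils.testing_utils import (
    backend_empty_cache,
    require_torch_accelerator,
    slow,
    torch_device,
)


@slow
@require_torch_accelerator  # runs on CUDA, XPU, or MPS instead of CUDA only
class ExamplePipelineIntegrationTests(unittest.TestCase):  # illustrative name
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_generator_stays_on_cpu(self):
        # Generators are seeded on CPU so expected slices match across backends.
        generator = torch.Generator("cpu").manual_seed(0)
        self.assertEqual(generator.device.type, "cpu")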