Unverified Commit ec37e209 authored by Fanli Lin, committed by GitHub

[tests] make tests device-agnostic (part 3) (#10437)



* initial commit

* fix empty cache

* fix one more

* fix style

* update device functions

* update

* update

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* with gc.collect

* update

* make style

* check_torch_dependencies

* add mps empty cache

* bug fix

* Apply suggestions from code review

---------
Co-authored-by: hlky <hlky@hlky.ac>
parent 158a5a87
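The hunks below swap the CUDA-only calls (torch.cuda.empty_cache, torch.cuda.max_memory_allocated, the reset_* counters) and the require_torch_gpu decorator for device-agnostic helpers imported from diffusers.utils.testing_utils. The helpers themselves are not part of this excerpt; the following is only a rough sketch of how such a dispatcher could look, assuming it branches on the torch_device string. The actual implementation in testing_utils.py may differ.

```python
# Hypothetical sketch only: the real backend_* helpers live in
# src/diffusers/utils/testing_utils.py and may be implemented differently.
import torch


def backend_empty_cache(device: str) -> None:
    # Release cached allocator memory on whichever accelerator is active.
    if device.startswith("cuda"):
        torch.cuda.empty_cache()
    elif device.startswith("xpu") and hasattr(torch, "xpu"):
        torch.xpu.empty_cache()
    elif device.startswith("mps") and hasattr(torch, "mps"):
        torch.mps.empty_cache()
    # CPU has no allocator cache, so nothing to do.


def backend_max_memory_allocated(device: str) -> int:
    # Peak memory allocated on the active accelerator; 0 where the backend
    # does not expose a tracker.
    if device.startswith("cuda"):
        return torch.cuda.max_memory_allocated()
    if (
        device.startswith("xpu")
        and hasattr(torch, "xpu")
        and hasattr(torch.xpu, "max_memory_allocated")
    ):
        return torch.xpu.max_memory_allocated()
    return 0
```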
@@ -23,11 +23,15 @@ from diffusers import IFInpaintingPipeline
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
+backend_empty_cache,
+backend_max_memory_allocated,
+backend_reset_max_memory_allocated,
+backend_reset_peak_memory_stats,
floats_tensor,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
-require_torch_gpu,
+require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -106,30 +110,30 @@ class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin,
@slow
-@require_torch_gpu
+@require_torch_accelerator
class IFInpaintingPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def test_if_inpainting(self):
pipe = IFInpaintingPipeline.from_pretrained(
"DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
-pipe.enable_model_cpu_offload()
+pipe.enable_model_cpu_offload(device=torch_device)
-torch.cuda.empty_cache()
-torch.cuda.reset_max_memory_allocated()
-torch.cuda.reset_peak_memory_stats()
+backend_empty_cache(torch_device)
+backend_reset_max_memory_allocated(torch_device)
+backend_reset_peak_memory_stats(torch_device)
image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device)
@@ -145,7 +149,7 @@ class IFInpaintingPipelineSlowTests(unittest.TestCase):
)
image = output.images[0]
-mem_bytes = torch.cuda.max_memory_allocated()
+mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 12 * 10**9
expected_image = load_numpy(
......
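Taken together, these edits give every slow-test class the same device-agnostic setUp/tearDown boilerplate. A minimal sketch of the resulting pattern (ExampleSlowTests is a placeholder name, not one of the classes in this commit):

```python
import gc
import unittest

from diffusers.utils.testing_utils import backend_empty_cache, torch_device


class ExampleSlowTests(unittest.TestCase):
    def setUp(self):
        # clean up the accelerator memory before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the accelerator memory after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)
```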
@@ -23,11 +23,15 @@ from diffusers import IFInpaintingSuperResolutionPipeline
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
+backend_empty_cache,
+backend_max_memory_allocated,
+backend_reset_max_memory_allocated,
+backend_reset_peak_memory_stats,
floats_tensor,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
-require_torch_gpu,
+require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -108,31 +112,31 @@ class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipeli
@slow
-@require_torch_gpu
+@require_torch_accelerator
class IFInpaintingSuperResolutionPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def test_if_inpainting_superresolution(self):
pipe = IFInpaintingSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
-pipe.enable_model_cpu_offload()
+pipe.enable_model_cpu_offload(device=torch_device)
# Super resolution test
-torch.cuda.empty_cache()
-torch.cuda.reset_max_memory_allocated()
-torch.cuda.reset_peak_memory_stats()
+backend_empty_cache(torch_device)
+backend_reset_max_memory_allocated(torch_device)
+backend_reset_peak_memory_stats(torch_device)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -154,7 +158,7 @@ class IFInpaintingSuperResolutionPipelineSlowTests(unittest.TestCase):
assert image.shape == (256, 256, 3)
-mem_bytes = torch.cuda.max_memory_allocated()
+mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 12 * 10**9
expected_image = load_numpy(
......
@@ -23,11 +23,15 @@ from diffusers import IFSuperResolutionPipeline
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
+backend_empty_cache,
+backend_max_memory_allocated,
+backend_reset_max_memory_allocated,
+backend_reset_peak_memory_stats,
floats_tensor,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
-require_torch_gpu,
+require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -101,31 +105,31 @@ class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMi
@slow
-@require_torch_gpu
+@require_torch_accelerator
class IFSuperResolutionPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def test_if_superresolution(self):
pipe = IFSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
-pipe.enable_model_cpu_offload()
+pipe.enable_model_cpu_offload(device=torch_device)
# Super resolution test
-torch.cuda.empty_cache()
-torch.cuda.reset_max_memory_allocated()
-torch.cuda.reset_peak_memory_stats()
+backend_empty_cache(torch_device)
+backend_reset_max_memory_allocated(torch_device)
+backend_reset_peak_memory_stats(torch_device)
image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -141,7 +145,7 @@ class IFSuperResolutionPipelineSlowTests(unittest.TestCase):
assert image.shape == (256, 256, 3)
-mem_bytes = torch.cuda.max_memory_allocated()
+mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 12 * 10**9
expected_image = load_numpy(
......
@@ -30,7 +30,7 @@ from diffusers import (
from diffusers.utils.testing_utils import (
enable_full_determinism,
numpy_cosine_similarity_distance,
-require_torch_gpu,
+require_torch_accelerator,
slow,
torch_device,
)
@@ -299,7 +299,7 @@ class HunyuanDiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
-@require_torch_gpu
+@require_torch_accelerator
class HunyuanDiTPipelineIntegrationTests(unittest.TestCase):
prompt = "一个宇航员在骑马"
@@ -319,7 +319,7 @@ class HunyuanDiTPipelineIntegrationTests(unittest.TestCase):
pipe = HunyuanDiTPipeline.from_pretrained(
"XCLiu/HunyuanDiT-0523", revision="refs/pr/2", torch_dtype=torch.float16
)
-pipe.enable_model_cpu_offload()
+pipe.enable_model_cpu_offload(device=torch_device)
prompt = self.prompt
image = pipe(
......
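The same hunks also pass the target device to enable_model_cpu_offload explicitly instead of relying on the CUDA default. A small usage sketch outside the test suite (the checkpoint name and prompt are placeholders):

```python
import torch

from diffusers import DiffusionPipeline
from diffusers.utils.testing_utils import torch_device

# Placeholder checkpoint; any diffusers pipeline works the same way.
pipe = DiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# Offload weights to CPU but run compute on whichever accelerator
# torch_device names, e.g. "cuda", "xpu" or "mps", instead of assuming CUDA.
pipe.enable_model_cpu_offload(device=torch_device)
image = pipe("an astronaut riding a horse").images[0]
```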
@@ -36,10 +36,11 @@ from diffusers import (
from diffusers.models.unets import I2VGenXLUNet
from diffusers.utils import is_xformers_available, load_image
from diffusers.utils.testing_utils import (
+backend_empty_cache,
enable_full_determinism,
floats_tensor,
numpy_cosine_similarity_distance,
-require_torch_gpu,
+require_torch_accelerator,
skip_mps,
slow,
torch_device,
@@ -228,23 +229,23 @@ class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unit
@slow
-@require_torch_gpu
+@require_torch_accelerator
class I2VGenXLPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def test_i2vgen_xl(self):
pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
-pipe.enable_model_cpu_offload()
+pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
......
@@ -66,6 +66,7 @@ from diffusers.utils import (
)
from diffusers.utils.testing_utils import (
CaptureLogger,
+backend_empty_cache,
enable_full_determinism,
floats_tensor,
get_python_version,
@@ -78,7 +79,7 @@ from diffusers.utils.testing_utils import (
require_hf_hub_version_greater,
require_onnxruntime,
require_torch_2,
-require_torch_gpu,
+require_torch_accelerator,
require_transformers_version_greater,
run_test_in_subprocess,
slow,
@@ -1150,7 +1151,7 @@ class CustomPipelineTests(unittest.TestCase):
assert conf_1 == conf_2
@slow
-@require_torch_gpu
+@require_torch_accelerator
def test_download_from_git(self):
# Because adaptive_avg_pool2d_backward_cuda
# does not have a deterministic implementation.
@@ -1364,7 +1365,7 @@ class PipelineFastTests(unittest.TestCase):
assert image_img2img.shape == (1, 32, 32, 3)
assert image_text2img.shape == (1, 64, 64, 3)
-@require_torch_gpu
+@require_torch_accelerator
def test_pipe_false_offload_warn(self):
unet = self.dummy_cond_unet()
scheduler = PNDMScheduler(skip_prk_steps=True)
@@ -1898,19 +1899,19 @@ class PipelineFastTests(unittest.TestCase):
@slow
-@require_torch_gpu
+@require_torch_accelerator
class PipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def test_smart_download(self):
model_id = "hf-internal-testing/unet-pipeline-dummy"
@@ -2102,7 +2103,7 @@ class PipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-pipe.enable_model_cpu_offload()
+pipe.enable_model_cpu_offload(device=torch_device)
pipe.enable_attention_slicing()
compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
@@ -2129,19 +2130,19 @@ class PipelineSlowTests(unittest.TestCase):
@nightly
-@require_torch_gpu
+@require_torch_accelerator
class PipelineNightlyTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
-torch.cuda.empty_cache()
+backend_empty_cache(torch_device)
def test_ddpm_ddim_equality_batched(self):
seed = 0
......