Unverified commit 9ecd9248, authored by Patrick von Platen, committed by GitHub

[Tests] Correct PT2 (#2724)

* [Tests] Correct PT2

* correct more

* move versatile to nightly

* up

* up

* again

* Apply suggestions from code review
parent 116f70cb
@@ -15,7 +15,6 @@
 import gc
 import math
-import tracemalloc
 import unittest

 import torch
@@ -155,33 +154,6 @@ class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase):
         assert torch_all_close(arr_accelerate, arr_normal_load, rtol=1e-3)

-    @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU")
-    def test_memory_footprint_gets_reduced(self):
-        torch.cuda.empty_cache()
-        gc.collect()
-
-        tracemalloc.start()
-        # by default, model loading will use accelerate as `low_cpu_mem_usage=True`
-        model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
-        model_accelerate.to(torch_device)
-        model_accelerate.eval()
-        _, peak_accelerate = tracemalloc.get_traced_memory()
-
-        del model_accelerate
-        torch.cuda.empty_cache()
-        gc.collect()
-
-        model_normal_load, _ = UNet2DModel.from_pretrained(
-            "fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=False
-        )
-        model_normal_load.to(torch_device)
-        model_normal_load.eval()
-        _, peak_normal = tracemalloc.get_traced_memory()
-        tracemalloc.stop()
-
-        assert peak_accelerate < peak_normal
-
     def test_output_pretrained(self):
         model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update")
         model.eval()
...
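A note on the test removed above: it used `tracemalloc` to assert that the default accelerate-backed loading path (`low_cpu_mem_usage=True`) peaks lower than an eager load. `tracemalloc` only tracks Python-heap allocations, so tensor storage allocated by torch's C++ allocator is largely invisible to it, which made the comparison fragile. A minimal standalone sketch of the same idea, assuming the same dummy checkpoint:

```python
import gc
import tracemalloc

from diffusers import UNet2DModel


def peak_load_memory(**kwargs) -> int:
    """Return the peak Python-heap allocation observed while loading."""
    gc.collect()
    tracemalloc.start()
    model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", **kwargs)
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    del model
    return peak


peak_accelerate = peak_load_memory()  # low_cpu_mem_usage=True is the default
peak_normal = peak_load_memory(low_cpu_mem_usage=False)  # materializes all weights eagerly
print(peak_accelerate, peak_normal)
```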
@@ -125,8 +125,8 @@ class DiTPipelineIntegrationTests(unittest.TestCase):
             )
             assert np.abs((expected_image - image).max()) < 1e-3

-    def test_dit_512_fp16(self):
-        pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512", torch_dtype=torch.float16)
+    def test_dit_512(self):
+        pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512")
         pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
         pipe.to("cuda")
@@ -139,7 +139,7 @@ class DiTPipelineIntegrationTests(unittest.TestCase):
         for word, image in zip(words, images):
             expected_image = load_numpy(
                 "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-                f"/dit/{word}_fp16.npy"
+                f"/dit/{word}_512.npy"
             )
-            assert np.abs((expected_image - image).max()) < 7.5e-1
+            assert np.abs((expected_image - image).max()) < 1e-1
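The renamed test drops `torch_dtype=torch.float16` and compares against new `_512.npy` references, which is why the tolerance tightens from 7.5e-1 to 1e-1: full-precision outputs are far more stable than the old fp16 run. A sketch of the path it exercises (the label and step count here are illustrative):

```python
import torch
from diffusers import DiTPipeline, DPMSolverMultistepScheduler

# Full precision now -- no torch_dtype=torch.float16.
pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

# DiT is class-conditional: map human-readable ImageNet labels to class ids.
class_ids = pipe.get_label_ids(["white shark"])
generator = torch.manual_seed(0)
images = pipe(class_labels=class_ids, generator=generator, num_inference_steps=25).images
```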
@@ -118,7 +118,6 @@ class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase):
         init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"])

         ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution", device_map="auto")
-        ldm.to(torch_device)
         ldm.set_progress_bar_config(disable=None)

         generator = torch.manual_seed(0)
...
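The dropped `ldm.to(torch_device)` was redundant here: with `device_map="auto"`, accelerate already assigns each submodule to a device at load time, and moving the pipeline again afterwards can clash with the dispatch hooks. As a sketch:

```python
from diffusers import LDMSuperResolutionPipeline

# accelerate places submodules at load time; no explicit .to("cuda") afterwards.
ldm = LDMSuperResolutionPipeline.from_pretrained(
    "duongna/ldm-super-resolution", device_map="auto"
)
```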
@@ -35,6 +35,7 @@ from diffusers import (
     UNet2DConditionModel,
     logging,
 )
+from diffusers.models.attention_processor import AttnProcessor
 from diffusers.utils import load_numpy, nightly, slow, torch_device
 from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
@@ -698,7 +699,6 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         torch.cuda.reset_peak_memory_stats()
         model_id = "CompVis/stable-diffusion-v1-4"
         pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
-        pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing()
         pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
@@ -708,42 +708,36 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         # enable vae tiling
         pipe.enable_vae_tiling()
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            output_chunked = pipe(
-                [prompt],
-                width=640,
-                height=640,
-                generator=generator,
-                guidance_scale=7.5,
-                num_inference_steps=2,
-                output_type="numpy",
-            )
+        pipe.enable_model_cpu_offload()
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output_chunked = pipe(
+            [prompt],
+            width=1024,
+            height=1024,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="numpy",
+        )
         image_chunked = output_chunked.images

         mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
-        # make sure that less than 4 GB is allocated
-        assert mem_bytes < 4e9

         # disable vae tiling
         pipe.disable_vae_tiling()
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            output = pipe(
-                [prompt],
-                width=640,
-                height=640,
-                generator=generator,
-                guidance_scale=7.5,
-                num_inference_steps=2,
-                output_type="numpy",
-            )
-            image = output.images
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        output = pipe(
+            [prompt],
+            width=1024,
+            height=1024,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="numpy",
+        )
+        image = output.images

-        # make sure that more than 4 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
-        assert mem_bytes > 5e9
+        assert mem_bytes < 1e10
         assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-2

     def test_stable_diffusion_fp16_vs_autocast(self):
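The rewritten tiling test replaces autocast at 640x640 with fp16 weights, model CPU offload, and 1024x1024 outputs, seeds the generator on the CPU so results are device-independent, and keeps a single upper bound (`mem_bytes < 1e10`) on the tiled pass. A sketch of the pattern it now exercises, with an illustrative prompt:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()  # keeps only the active submodule on the GPU (needs accelerate)
pipe.enable_vae_tiling()         # decodes latents in overlapping tiles to cap VRAM

generator = torch.Generator(device="cpu").manual_seed(0)  # CPU seeding is device-independent
image = pipe(
    "a photograph of an astronaut riding a horse",
    width=1024,
    height=1024,
    generator=generator,
).images[0]
```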
@@ -849,6 +843,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
             "CompVis/stable-diffusion-v1-4",
             torch_dtype=torch.float16,
         )
+        pipe.unet.set_attn_processor(AttnProcessor())
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)

         outputs = pipe(**inputs)
@@ -861,6 +856,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
             "CompVis/stable-diffusion-v1-4",
             torch_dtype=torch.float16,
         )
+        pipe.unet.set_attn_processor(AttnProcessor())

         torch.cuda.empty_cache()
         torch.cuda.reset_max_memory_allocated()
@@ -868,6 +864,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         pipe.enable_model_cpu_offload()
         pipe.set_progress_bar_config(disable=None)

+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
+
         outputs_offloaded = pipe(**inputs)
         mem_bytes_offloaded = torch.cuda.max_memory_allocated()
...
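The `set_attn_processor(AttnProcessor())` calls above are the heart of the PT2 fix: with torch 2.0 installed, diffusers selects `AttnProcessor2_0`, which routes attention through `torch.nn.functional.scaled_dot_product_attention` and yields slightly different numerics than the classic processor, so the stored reference slices drift. Pinning the classic path keeps the tests reproducible on both torch 1.x and 2.x:

```python
from diffusers.models.attention_processor import AttnProcessor

# Force the pre-2.0 attention implementation so outputs match the recorded slices.
pipe.unet.set_attn_processor(AttnProcessor())
```

The re-created `inputs = self.get_inputs(...)` before the offloaded run likewise ensures that pass starts from a fresh generator rather than one the first run already consumed.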
@@ -214,7 +214,7 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
         image_slice = image[0, 253:256, 253:256, -1].flatten()

         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.1443, 0.1218, 0.1587, 0.1594, 0.1411, 0.1284, 0.1370, 0.1506, 0.2339])
+        expected_slice = np.array([0.1350, 0.1123, 0.1350, 0.1641, 0.1328, 0.1230, 0.1289, 0.1531, 0.1687])
         assert np.abs(expected_slice - image_slice).max() < 5e-2
...
@@ -339,4 +339,4 @@ class StableDiffusionPanoramaSlowTests(unittest.TestCase):
         mem_bytes = torch.cuda.max_memory_allocated()
         # make sure that less than 5.2 GB is allocated
-        assert mem_bytes < 5.2 * 10**9
+        assert mem_bytes < 5.5 * 10**9
@@ -361,7 +361,7 @@ class InversionPipelineSlowTests(unittest.TestCase):
         image_slice = inv_latents[0, -3:, -3:, -1].flatten()

         assert inv_latents.shape == (1, 4, 64, 64)
-        expected_slice = np.array([0.8877, 0.0587, 0.7700, -1.6035, -0.5962, 0.4827, -0.6265, 1.0498, -0.8599])
+        expected_slice = np.array([0.8447, -0.0730, 0.7588, -1.2070, -0.4678, 0.1511, -0.8555, 1.1816, -0.7666])
         assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2
@@ -383,7 +383,7 @@ class InversionPipelineSlowTests(unittest.TestCase):
         image_slice = inv_latents[0, -3:, -3:, -1].flatten()

         assert inv_latents.shape == (1, 4, 64, 64)
-        expected_slice = np.array([0.7515, -0.2397, 0.4922, -0.9736, -0.7031, 0.4846, -1.0781, 1.1309, -0.6973])
+        expected_slice = np.array([0.8970, -0.1611, 0.4766, -1.1162, -0.5923, 0.1050, -0.9678, 1.0537, -0.6050])
         assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2
...
@@ -32,6 +32,7 @@ from diffusers import (
     UNet2DConditionModel,
     logging,
 )
+from diffusers.models.attention_processor import AttnProcessor
 from diffusers.utils import load_numpy, nightly, slow, torch_device
 from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
@@ -409,6 +410,7 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
             "stabilityai/stable-diffusion-2-base",
             torch_dtype=torch.float16,
         )
+        pipe.unet.set_attn_processor(AttnProcessor())
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)

         outputs = pipe(**inputs)
@@ -421,6 +423,7 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
             "stabilityai/stable-diffusion-2-base",
             torch_dtype=torch.float16,
         )
+        pipe.unet.set_attn_processor(AttnProcessor())

         torch.cuda.empty_cache()
         torch.cuda.reset_max_memory_allocated()
@@ -428,6 +431,7 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
         pipe.enable_model_cpu_offload()
         pipe.set_progress_bar_config(disable=None)

+        inputs = self.get_inputs(torch_device, dtype=torch.float16)
         outputs_offloaded = pipe(**inputs)
         mem_bytes_offloaded = torch.cuda.max_memory_allocated()
...
@@ -358,5 +358,5 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
         )

         mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 2.65 GB is allocated
-        assert mem_bytes < 2.65 * 10**9
+        # make sure that less than 2.9 GB is allocated
+        assert mem_bytes < 2.9 * 10**9
@@ -21,17 +21,13 @@ import numpy as np
 import torch

 from diffusers import VersatileDiffusionDualGuidedPipeline
-from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device

 torch.backends.cuda.matmul.allow_tf32 = False


-class VersatileDiffusionDualGuidedPipelineFastTests(unittest.TestCase):
-    pass
-
-
-@slow
+@nightly
 @require_torch_gpu
 class VersatileDiffusionDualGuidedPipelineIntegrationTests(unittest.TestCase):
     def tearDown(self):
...
@@ -21,7 +21,7 @@ import numpy as np
 import torch

 from diffusers import VersatileDiffusionPipeline
-from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device

 torch.backends.cuda.matmul.allow_tf32 = False
@@ -31,7 +31,7 @@ class VersatileDiffusionMegaPipelineFastTests(unittest.TestCase):
     pass


-@slow
+@nightly
 @require_torch_gpu
 class VersatileDiffusionMegaPipelineIntegrationTests(unittest.TestCase):
     def tearDown(self):
...
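Moving the Versatile Diffusion suites from `@slow` to `@nightly` pulls them out of the per-merge slow job and into the nightly CI run only (`@slow` gates on `RUN_SLOW`, `@nightly` on `RUN_NIGHTLY`); the empty dual-guided `FastTests` placeholder class is dropped along the way. The decorator pattern, sketched with a hypothetical class name:

```python
import unittest

from diffusers.utils.testing_utils import nightly, require_torch_gpu


@nightly  # collected only when RUN_NIGHTLY=1 is set in the environment
@require_torch_gpu
class MyHeavyPipelineIntegrationTests(unittest.TestCase):  # hypothetical name
    def test_something_expensive(self):
        ...
```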
@@ -153,4 +153,4 @@ class EMAModelTests(unittest.TestCase):
             output = unet(noisy_latents, timesteps, encoder_hidden_states).sample
             output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample

-        assert torch.allclose(output, output_loaded)
+        assert torch.allclose(output, output_loaded, atol=1e-4)
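`torch.allclose(a, b)` checks `|a - b| <= atol + rtol * |b|` with defaults `rtol=1e-5` and `atol=1e-8`, which is too strict for weights round-tripped through an EMA copy and serialization; the added `atol=1e-4` provides absolute slack, which matters most for values near zero:

```python
import torch

a = torch.tensor([0.0, 1.0])
b = a + 5e-5

print(torch.allclose(a, b))             # False: the near-zero element fails atol=1e-8
print(torch.allclose(a, b, atol=1e-4))  # True: absolute tolerance absorbs the difference
```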
@@ -25,6 +25,7 @@ import torch
 from requests.exceptions import HTTPError

 from diffusers.models import ModelMixin, UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessor
 from diffusers.training_utils import EMAModel
 from diffusers.utils import torch_device
@@ -105,12 +106,16 @@ class ModelTesterMixin:
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

         model = self.model_class(**init_dict)
+        if hasattr(model, "set_attn_processor"):
+            model.set_attn_processor(AttnProcessor())
         model.to(torch_device)
         model.eval()

         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save_pretrained(tmpdirname)
             new_model = self.model_class.from_pretrained(tmpdirname)
+            if hasattr(new_model, "set_attn_processor"):
+                new_model.set_attn_processor(AttnProcessor())
             new_model.to(torch_device)

         with torch.no_grad():
@@ -135,12 +140,16 @@ class ModelTesterMixin:
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

         model = self.model_class(**init_dict)
+        if hasattr(model, "set_attn_processor"):
+            model.set_attn_processor(AttnProcessor())
         model.to(torch_device)
         model.eval()

         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save_pretrained(tmpdirname, variant="fp16")
             new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
+            if hasattr(new_model, "set_attn_processor"):
+                new_model.set_attn_processor(AttnProcessor())

             # non-variant cannot be loaded
             with self.assertRaises(OSError) as error_context:
...
@@ -1123,7 +1123,7 @@ class PipelineSlowTests(unittest.TestCase):
                 f"/compel/forest_{i}.npy"
             )

-            assert np.abs(image - expected_image).max() < 1e-3
+            assert np.abs(image - expected_image).max() < 1e-2

     @nightly
...