Unverified Commit 3ae0ee88 authored by Aryan, committed by GitHub

[tests] speed up animatediff tests (#8846)



* speed up animatediff tests

* fix pia test_ip_adapter_single

* fix tests/pipelines/pia/test_pia.py::PIAPipelineFastTests::test_dict_tuple_outputs_equivalent

* update

* fix ip adapter tests

* skip test_from_pipe_consistent_config tests

* fix prompt_embeds test

* update test_from_pipe_consistent_config tests

* fix expected_slice values

* remove temporal_norm_num_groups from UpBlockMotion

---------
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 5fbb4d32
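The speed-up in this PR comes from shrinking the dummy components the fast tests build: fewer channels per block, a smaller latent sample size, and a cross-attention dimension that matches the tiny text encoder. A standalone sketch of the pattern (not part of the diff; the kwargs are copied from the hunks below, the parameter-count print is illustrative):

import torch
from diffusers import UNet2DConditionModel

torch.manual_seed(0)
tiny_unet = UNet2DConditionModel(
    block_out_channels=(8, 8),  # was (32, 64): far fewer parameters per block
    layers_per_block=2,
    sample_size=8,  # was 32: 16x fewer latent positions per forward pass
    in_channels=4,
    out_channels=4,
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
    cross_attention_dim=8,  # was 32: must equal the text encoder's hidden_size
    norm_num_groups=2,  # the group count must divide every entry of block_out_channels
)
print(sum(p.numel() for p in tiny_unet.parameters()))  # a tiny fraction of the old config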
@@ -1532,7 +1532,6 @@ class UpBlockMotion(nn.Module):
         resnet_pre_norm: bool = True,
         output_scale_factor: float = 1.0,
         add_upsample: bool = True,
-        temporal_norm_num_groups: int = 32,
         temporal_cross_attention_dim: Optional[int] = None,
         temporal_num_attention_heads: int = 8,
         temporal_max_seq_length: int = 32,
@@ -1574,7 +1573,7 @@ class UpBlockMotion(nn.Module):
                     num_attention_heads=temporal_num_attention_heads,
                     in_channels=out_channels,
                     num_layers=temporal_transformer_layers_per_block[i],
-                    norm_num_groups=temporal_norm_num_groups,
+                    norm_num_groups=resnet_groups,
                     cross_attention_dim=temporal_cross_attention_dim,
                     attention_bias=False,
                     activation_fn="geglu",
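Dropping `temporal_norm_num_groups` and reusing `resnet_groups` is what lets UpBlockMotion work with the 8-channel test configs above: `torch.nn.GroupNorm` requires the group count to divide the channel count, so the old hard-coded default of 32 groups would reject 8-channel blocks. A plain-PyTorch illustration of the constraint:

import torch.nn as nn

nn.GroupNorm(num_groups=2, num_channels=8)  # valid: 2 divides 8
try:
    nn.GroupNorm(num_groups=32, num_channels=8)  # what the old default implied for tiny blocks
except ValueError as err:
    print(err)  # "num_channels must be divisible by num_groups"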
@@ -11,6 +11,7 @@ from diffusers import (
     AutoencoderKL,
     DDIMScheduler,
     MotionAdapter,
+    StableDiffusionPipeline,
     UNet2DConditionModel,
     UNetMotionModel,
 )
@@ -51,16 +52,19 @@ class AnimateDiffPipelineFastTests(
     )
 
     def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             layers_per_block=2,
-            sample_size=32,
+            sample_size=8,
             in_channels=4,
             out_channels=4,
             down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
@@ -71,18 +75,19 @@ class AnimateDiffPipelineFastTests(
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=block_out_channels,
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
+            hidden_size=cross_attention_dim,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
@@ -92,8 +97,9 @@ class AnimateDiffPipelineFastTests(
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
         motion_adapter = MotionAdapter(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             motion_layers_per_block=2,
             motion_norm_num_groups=2,
             motion_num_attention_heads=4,
@@ -126,6 +132,36 @@ class AnimateDiffPipelineFastTests(
         }
         return inputs
 
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
     def test_motion_unet_loading(self):
         components = self.get_dummy_components()
         pipe = AnimateDiffPipeline(**components)
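The new `test_from_pipe_consistent_config` checks a round trip: build the AnimateDiff pipeline from a Stable Diffusion pipeline with `from_pipe`, convert back, and assert that the non-private config entries survive unchanged. A condensed sketch of the same check as a reusable helper (the helper name and signature are hypothetical, not part of diffusers or this PR):

def config_survives_round_trip(original_pipe, target_cls, extra_components):
    # Forward: supply only the components the original pipeline lacks.
    target = target_cls.from_pipe(original_pipe, **extra_components)
    # Backward: the real test also restores components the target dropped
    # (e.g. the safety checker) before comparing configs.
    restored = type(original_pipe).from_pipe(target)
    before = {k: v for k, v in original_pipe.config.items() if not k.startswith("_")}
    after = {k: v for k, v in restored.config.items() if not k.startswith("_")}
    return before == after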
@@ -141,33 +177,33 @@ class AnimateDiffPipelineFastTests(
         if torch_device == "cpu":
             expected_pipe_slice = np.array(
                 [
-                    0.5541,
-                    0.5802,
-                    0.5074,
-                    0.4583,
-                    0.4729,
-                    0.5374,
-                    0.4051,
-                    0.4495,
-                    0.4480,
-                    0.5292,
-                    0.6322,
-                    0.6265,
-                    0.5455,
-                    0.4771,
-                    0.5795,
-                    0.5845,
-                    0.4172,
-                    0.6066,
-                    0.6535,
-                    0.4113,
-                    0.6833,
-                    0.5736,
-                    0.3589,
-                    0.5730,
-                    0.4205,
-                    0.3786,
-                    0.5323,
+                    0.5216,
+                    0.5620,
+                    0.4927,
+                    0.5082,
+                    0.4786,
+                    0.5932,
+                    0.5125,
+                    0.4514,
+                    0.5315,
+                    0.4694,
+                    0.3276,
+                    0.4863,
+                    0.3920,
+                    0.3684,
+                    0.5745,
+                    0.4499,
+                    0.5081,
+                    0.5414,
+                    0.6014,
+                    0.5062,
+                    0.3630,
+                    0.5296,
+                    0.6018,
+                    0.5098,
+                    0.4948,
+                    0.5101,
+                    0.5620,
                 ]
             )
         return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
@@ -175,7 +211,7 @@ class AnimateDiffPipelineFastTests(
     def test_dict_tuple_outputs_equivalent(self):
         expected_slice = None
         if torch_device == "cpu":
-            expected_slice = np.array([0.4051, 0.4495, 0.4480, 0.5845, 0.4172, 0.6066, 0.4205, 0.3786, 0.5323])
+            expected_slice = np.array([0.5125, 0.4514, 0.5315, 0.4499, 0.5081, 0.5414, 0.4948, 0.5101, 0.5620])
         return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
 
     def test_inference_batch_single_identical(
@@ -279,7 +315,7 @@ class AnimateDiffPipelineFastTests(
         inputs = self.get_dummy_inputs(torch_device)
         inputs.pop("prompt")
-        inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device)
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
         pipe(**inputs)
 
     def test_free_init(self):
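The `prompt_embeds` fixes follow from the text encoder shrinking to `hidden_size=8`: pre-computed embeddings bypass the tokenizer and encoder, so their trailing dimension must match what the UNet's cross-attention layers were built for. Reading the size off the pipeline's own config, as the diff now does, keeps the test valid under any dummy shape. A sketch of the shape contract (`pipe` is assumed to be an AnimateDiffPipeline built from the dummy components above; the call is illustrative, not from the diff):

import torch

batch_size, seq_len = 1, 4
hidden_size = pipe.text_encoder.config.hidden_size  # 8 with the dummy config
prompt_embeds = torch.randn(batch_size, seq_len, hidden_size)
# Any other trailing dimension fails inside the UNet's cross-attention,
# which was built with cross_attention_dim == hidden_size.
output = pipe(prompt_embeds=prompt_embeds, num_frames=2, num_inference_steps=2)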
@@ -11,6 +11,7 @@ from diffusers import (
     AutoencoderKL,
     DDIMScheduler,
     MotionAdapter,
+    StableDiffusionPipeline,
     UNet2DConditionModel,
     UNetMotionModel,
 )
@@ -46,16 +47,19 @@ class AnimateDiffVideoToVideoPipelineFastTests(
     )
 
     def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             layers_per_block=2,
-            sample_size=32,
+            sample_size=8,
             in_channels=4,
             out_channels=4,
             down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
@@ -66,18 +70,19 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=block_out_channels,
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
+            hidden_size=cross_attention_dim,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
@@ -87,8 +92,9 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
        motion_adapter = MotionAdapter(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             motion_layers_per_block=2,
             motion_norm_num_groups=2,
             motion_num_attention_heads=4,
@@ -127,6 +133,36 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         }
         return inputs
 
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
     def test_motion_unet_loading(self):
         components = self.get_dummy_components()
         pipe = AnimateDiffVideoToVideoPipeline(**components)
@@ -143,24 +179,24 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         if torch_device == "cpu":
             expected_pipe_slice = np.array(
                 [
-                    0.4947,
-                    0.4780,
-                    0.4340,
-                    0.4666,
-                    0.4028,
-                    0.4645,
-                    0.4915,
-                    0.4101,
-                    0.4308,
-                    0.4581,
-                    0.3582,
-                    0.4953,
-                    0.4466,
-                    0.5348,
-                    0.5863,
-                    0.5299,
-                    0.5569,
-                    0.6250,
+                    0.4145,
+                    0.5613,
+                    0.5563,
+                    0.5213,
+                    0.5017,
+                    0.5092,
+                    0.4950,
+                    0.4950,
+                    0.5685,
+                    0.3858,
+                    0.4864,
+                    0.6458,
+                    0.4312,
+                    0.5518,
+                    0.5608,
+                    0.4418,
+                    0.5378,
                 ]
             )
         return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
@@ -266,7 +302,7 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         inputs = self.get_dummy_inputs(torch_device)
         inputs.pop("prompt")
-        inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device)
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
         pipe(**inputs)
 
     def test_latent_inputs(self):
@@ -276,7 +312,8 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         pipe.to(torch_device)
 
         inputs = self.get_dummy_inputs(torch_device)
-        inputs["latents"] = torch.randn((1, 4, 1, 32, 32), device=torch_device)
+        sample_size = pipe.unet.config.sample_size
+        inputs["latents"] = torch.randn((1, 4, 1, sample_size, sample_size), device=torch_device)
         inputs.pop("video")
         pipe(**inputs)
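The same principle applies to `test_latent_inputs`: video latents are 5-D, `(batch, channels, frames, height, width)`, and their spatial size must equal the UNet's latent `sample_size`, which this PR changed from 32 to 8. Deriving it from the pipeline's config keeps the test in sync with any future shape change. A sketch (`pipe` is assumed to be an AnimateDiffVideoToVideoPipeline as above):

import torch

sample_size = pipe.unet.config.sample_size  # 8 after this PR, 32 before
# One frame of 4-channel latents at the UNet's native latent resolution.
latents = torch.randn(1, 4, 1, sample_size, sample_size)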
@@ -11,6 +11,7 @@ from diffusers import (
     DDIMScheduler,
     MotionAdapter,
     PIAPipeline,
+    StableDiffusionPipeline,
     UNet2DConditionModel,
     UNetMotionModel,
 )
@@ -54,16 +55,19 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
     )
 
     def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             layers_per_block=2,
-            sample_size=32,
+            sample_size=8,
             in_channels=4,
             out_channels=4,
             down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
@@ -74,18 +78,19 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=block_out_channels,
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
+            hidden_size=cross_attention_dim,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
@@ -95,8 +100,9 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
         motion_adapter = MotionAdapter(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             motion_layers_per_block=2,
             motion_norm_num_groups=2,
             motion_num_attention_heads=4,
@@ -121,7 +127,7 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = floats_tensor((1, 3, 8, 8), rng=random.Random(seed)).to(device)
         inputs = {
             "image": image,
             "prompt": "A painting of a squirrel eating a burger",
@@ -132,6 +138,36 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         }
         return inputs
 
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
     def test_motion_unet_loading(self):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
@@ -144,33 +180,33 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         if torch_device == "cpu":
             expected_pipe_slice = np.array(
                 [
-                    0.5609,
-                    0.5756,
-                    0.4830,
-                    0.4420,
-                    0.4547,
-                    0.5129,
-                    0.3779,
-                    0.4042,
-                    0.3772,
-                    0.4450,
-                    0.5710,
-                    0.5536,
-                    0.4835,
-                    0.4308,
-                    0.5578,
-                    0.5578,
-                    0.4395,
-                    0.5475,
-                    0.5769,
-                    0.4873,
-                    0.5064,
-                    0.4445,
-                    0.5876,
-                    0.5453,
-                    0.4102,
-                    0.5247,
-                    0.5370,
+                    0.3406,
+                    0.4322,
+                    0.3991,
+                    0.3756,
+                    0.5438,
+                    0.4780,
+                    0.5087,
+                    0.5248,
+                    0.6243,
+                    0.5506,
+                    0.3491,
+                    0.5440,
+                    0.6051,
+                    0.4651,
+                    0.6258,
+                    0.5662,
+                    0.3988,
+                    0.5108,
+                    0.4153,
+                    0.3993,
+                    0.4803,
+                    0.6111,
+                    0.5122,
+                    0.5326,
+                    0.5180,
+                    0.5538,
                 ]
             )
         return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
@@ -178,7 +214,7 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
     def test_dict_tuple_outputs_equivalent(self):
         expected_slice = None
         if torch_device == "cpu":
-            expected_slice = np.array([0.3740, 0.4284, 0.4038, 0.5417, 0.4405, 0.5521, 0.4273, 0.4124, 0.4997])
+            expected_slice = np.array([0.5476, 0.4092, 0.5289, 0.4755, 0.5092, 0.5186, 0.5403, 0.5287, 0.5467])
         return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
 
     @unittest.skip("Attention slicing is not enabled in this pipeline")
@@ -286,7 +322,7 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         inputs = self.get_dummy_inputs(torch_device)
         inputs.pop("prompt")
-        inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device)
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
         pipe(**inputs)
 
     def test_free_init(self):