Unverified Commit 3ae0ee88 authored by Aryan, committed by GitHub

[tests] speed up animatediff tests (#8846)



* speed up animatediff tests

* fix pia test_ip_adapter_single

* fix tests/pipelines/pia/test_pia.py::PIAPipelineFastTests::test_dict_tuple_outputs_equivalent

* update

* fix ip adapter tests

* skip test_from_pipe_consistent_config tests

* fix prompt_embeds test

* update test_from_pipe_consistent_config tests

* fix expected_slice values

* remove temporal_norm_num_groups from UpBlockMotion

---------
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 5fbb4d32
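The speed-up in this PR comes from shrinking the dummy components the fast tests build: fewer channels per block, a smaller latent sample size, and a cross-attention dimension that matches the tiny text encoder. A standalone sketch of the pattern (not part of the diff; the kwargs are copied from the hunks below, the parameter-count print is illustrative):

import torch
from diffusers import UNet2DConditionModel

torch.manual_seed(0)
tiny_unet = UNet2DConditionModel(
    block_out_channels=(8, 8),  # was (32, 64): far fewer parameters per block
    layers_per_block=2,
    sample_size=8,  # was 32: 16x fewer latent positions per forward pass
    in_channels=4,
    out_channels=4,
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
    cross_attention_dim=8,  # was 32: must equal the text encoder's hidden_size
    norm_num_groups=2,  # the group count must divide every entry of block_out_channels
)
print(sum(p.numel() for p in tiny_unet.parameters()))  # a tiny fraction of the old config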
@@ -1532,7 +1532,6 @@ class UpBlockMotion(nn.Module):
         resnet_pre_norm: bool = True,
         output_scale_factor: float = 1.0,
         add_upsample: bool = True,
-        temporal_norm_num_groups: int = 32,
         temporal_cross_attention_dim: Optional[int] = None,
         temporal_num_attention_heads: int = 8,
         temporal_max_seq_length: int = 32,
@@ -1574,7 +1573,7 @@ class UpBlockMotion(nn.Module):
                     num_attention_heads=temporal_num_attention_heads,
                     in_channels=out_channels,
                     num_layers=temporal_transformer_layers_per_block[i],
-                    norm_num_groups=temporal_norm_num_groups,
+                    norm_num_groups=resnet_groups,
                     cross_attention_dim=temporal_cross_attention_dim,
                     attention_bias=False,
                     activation_fn="geglu",
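Dropping `temporal_norm_num_groups` and reusing `resnet_groups` is what lets UpBlockMotion work with the 8-channel test configs above: `torch.nn.GroupNorm` requires the group count to divide the channel count, so the old hard-coded default of 32 groups would reject 8-channel blocks. A plain-PyTorch illustration of the constraint:

import torch.nn as nn

nn.GroupNorm(num_groups=2, num_channels=8)  # valid: 2 divides 8
try:
    nn.GroupNorm(num_groups=32, num_channels=8)  # what the old default implied for tiny blocks
except ValueError as err:
    print(err)  # "num_channels must be divisible by num_groups"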
@@ -11,6 +11,7 @@ from diffusers import (
     AutoencoderKL,
     DDIMScheduler,
     MotionAdapter,
+    StableDiffusionPipeline,
     UNet2DConditionModel,
     UNetMotionModel,
 )
@@ -51,16 +52,19 @@ class AnimateDiffPipelineFastTests(
     )
 
     def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             layers_per_block=2,
-            sample_size=32,
+            sample_size=8,
             in_channels=4,
             out_channels=4,
             down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
@@ -71,18 +75,19 @@ class AnimateDiffPipelineFastTests(
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=block_out_channels,
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
+            hidden_size=cross_attention_dim,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
@@ -92,8 +97,9 @@ class AnimateDiffPipelineFastTests(
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
         motion_adapter = MotionAdapter(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             motion_layers_per_block=2,
             motion_norm_num_groups=2,
             motion_num_attention_heads=4,
@@ -126,6 +132,36 @@ class AnimateDiffPipelineFastTests(
         }
         return inputs
 
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
     def test_motion_unet_loading(self):
         components = self.get_dummy_components()
         pipe = AnimateDiffPipeline(**components)
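The new `test_from_pipe_consistent_config` checks a round trip: build the AnimateDiff pipeline from a Stable Diffusion pipeline with `from_pipe`, convert back, and assert that the non-private config entries survive unchanged. A condensed sketch of the same check as a reusable helper (the helper name and signature are hypothetical, not part of diffusers or this PR):

def config_survives_round_trip(original_pipe, target_cls, extra_components):
    # Forward: supply only the components the original pipeline lacks.
    target = target_cls.from_pipe(original_pipe, **extra_components)
    # Backward: the real test also restores components the target dropped
    # (e.g. the safety checker) before comparing configs.
    restored = type(original_pipe).from_pipe(target)
    before = {k: v for k, v in original_pipe.config.items() if not k.startswith("_")}
    after = {k: v for k, v in restored.config.items() if not k.startswith("_")}
    return before == after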
@@ -141,33 +177,33 @@ class AnimateDiffPipelineFastTests(
         if torch_device == "cpu":
             expected_pipe_slice = np.array(
                 [
-                    0.5541,
-                    0.5802,
-                    0.5074,
-                    0.4583,
-                    0.4729,
-                    0.5374,
-                    0.4051,
-                    0.4495,
-                    0.4480,
-                    0.5292,
-                    0.6322,
-                    0.6265,
-                    0.5455,
-                    0.4771,
-                    0.5795,
-                    0.5845,
-                    0.4172,
-                    0.6066,
-                    0.6535,
-                    0.4113,
-                    0.6833,
-                    0.5736,
-                    0.3589,
-                    0.5730,
-                    0.4205,
-                    0.3786,
-                    0.5323,
+                    0.5216,
+                    0.5620,
+                    0.4927,
+                    0.5082,
+                    0.4786,
+                    0.5932,
+                    0.5125,
+                    0.4514,
+                    0.5315,
+                    0.4694,
+                    0.3276,
+                    0.4863,
+                    0.3920,
+                    0.3684,
+                    0.5745,
+                    0.4499,
+                    0.5081,
+                    0.5414,
+                    0.6014,
+                    0.5062,
+                    0.3630,
+                    0.5296,
+                    0.6018,
+                    0.5098,
+                    0.4948,
+                    0.5101,
+                    0.5620,
                 ]
             )
         return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
@@ -175,7 +211,7 @@ class AnimateDiffPipelineFastTests(
     def test_dict_tuple_outputs_equivalent(self):
         expected_slice = None
         if torch_device == "cpu":
-            expected_slice = np.array([0.4051, 0.4495, 0.4480, 0.5845, 0.4172, 0.6066, 0.4205, 0.3786, 0.5323])
+            expected_slice = np.array([0.5125, 0.4514, 0.5315, 0.4499, 0.5081, 0.5414, 0.4948, 0.5101, 0.5620])
         return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
 
     def test_inference_batch_single_identical(
@@ -279,7 +315,7 @@ class AnimateDiffPipelineFastTests(
         inputs = self.get_dummy_inputs(torch_device)
         inputs.pop("prompt")
-        inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device)
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
         pipe(**inputs)
 
     def test_free_init(self):
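The `prompt_embeds` fixes follow from the text encoder shrinking to `hidden_size=8`: pre-computed embeddings bypass the tokenizer and encoder, so their trailing dimension must match what the UNet's cross-attention layers were built for. Reading the size off the pipeline's own config, as the diff now does, keeps the test valid under any dummy shape. A sketch of the shape contract (`pipe` is assumed to be an AnimateDiffPipeline built from the dummy components above; the call is illustrative, not from the diff):

import torch

batch_size, seq_len = 1, 4
hidden_size = pipe.text_encoder.config.hidden_size  # 8 with the dummy config
prompt_embeds = torch.randn(batch_size, seq_len, hidden_size)
# Any other trailing dimension fails inside the UNet's cross-attention,
# which was built with cross_attention_dim == hidden_size.
output = pipe(prompt_embeds=prompt_embeds, num_frames=2, num_inference_steps=2)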
@@ -11,6 +11,7 @@ from diffusers import (
     AutoencoderKL,
     DDIMScheduler,
     MotionAdapter,
+    StableDiffusionPipeline,
     UNet2DConditionModel,
     UNetMotionModel,
 )
@@ -46,16 +47,19 @@ class AnimateDiffVideoToVideoPipelineFastTests(
     )
 
     def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             layers_per_block=2,
-            sample_size=32,
+            sample_size=8,
             in_channels=4,
             out_channels=4,
             down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
@@ -66,18 +70,19 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=block_out_channels,
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
+            hidden_size=cross_attention_dim,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
@@ -87,8 +92,9 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
        motion_adapter = MotionAdapter(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             motion_layers_per_block=2,
             motion_norm_num_groups=2,
             motion_num_attention_heads=4,
@@ -127,6 +133,36 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         }
         return inputs
 
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
     def test_motion_unet_loading(self):
         components = self.get_dummy_components()
         pipe = AnimateDiffVideoToVideoPipeline(**components)
@@ -143,24 +179,24 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         if torch_device == "cpu":
             expected_pipe_slice = np.array(
                 [
-                    0.4947,
-                    0.4780,
-                    0.4340,
-                    0.4666,
-                    0.4028,
-                    0.4645,
-                    0.4915,
-                    0.4101,
-                    0.4308,
-                    0.4581,
-                    0.3582,
-                    0.4953,
-                    0.4466,
-                    0.5348,
-                    0.5863,
-                    0.5299,
-                    0.5569,
-                    0.6250,
+                    0.4145,
+                    0.5613,
+                    0.5563,
+                    0.5213,
+                    0.5017,
+                    0.5092,
+                    0.4950,
+                    0.4950,
+                    0.5685,
+                    0.3858,
+                    0.4864,
+                    0.6458,
+                    0.4312,
+                    0.5518,
+                    0.5608,
+                    0.4418,
+                    0.5378,
                 ]
             )
         return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
@@ -266,7 +302,7 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         inputs = self.get_dummy_inputs(torch_device)
         inputs.pop("prompt")
-        inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device)
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
         pipe(**inputs)
 
     def test_latent_inputs(self):
@@ -276,7 +312,8 @@ class AnimateDiffVideoToVideoPipelineFastTests(
         pipe.to(torch_device)
 
         inputs = self.get_dummy_inputs(torch_device)
-        inputs["latents"] = torch.randn((1, 4, 1, 32, 32), device=torch_device)
+        sample_size = pipe.unet.config.sample_size
+        inputs["latents"] = torch.randn((1, 4, 1, sample_size, sample_size), device=torch_device)
         inputs.pop("video")
         pipe(**inputs)
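The same principle applies to `test_latent_inputs`: video latents are 5-D, `(batch, channels, frames, height, width)`, and their spatial size must equal the UNet's latent `sample_size`, which this PR changed from 32 to 8. Deriving it from the pipeline's config keeps the test in sync with any future shape change. A sketch (`pipe` is assumed to be an AnimateDiffVideoToVideoPipeline as above):

import torch

sample_size = pipe.unet.config.sample_size  # 8 after this PR, 32 before
# One frame of 4-channel latents at the UNet's native latent resolution.
latents = torch.randn(1, 4, 1, sample_size, sample_size)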
@@ -11,6 +11,7 @@ from diffusers import (
     DDIMScheduler,
     MotionAdapter,
     PIAPipeline,
+    StableDiffusionPipeline,
     UNet2DConditionModel,
     UNetMotionModel,
 )
@@ -54,16 +55,19 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
     )
 
     def get_dummy_components(self):
+        cross_attention_dim = 8
+        block_out_channels = (8, 8)
+
         torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             layers_per_block=2,
-            sample_size=32,
+            sample_size=8,
             in_channels=4,
             out_channels=4,
             down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
+            cross_attention_dim=cross_attention_dim,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
@@ -74,18 +78,19 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=block_out_channels,
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
+            hidden_size=cross_attention_dim,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
@@ -95,8 +100,9 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        torch.manual_seed(0)
         motion_adapter = MotionAdapter(
-            block_out_channels=(32, 64),
+            block_out_channels=block_out_channels,
             motion_layers_per_block=2,
             motion_norm_num_groups=2,
             motion_num_attention_heads=4,
@@ -121,7 +127,7 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = floats_tensor((1, 3, 8, 8), rng=random.Random(seed)).to(device)
         inputs = {
             "image": image,
             "prompt": "A painting of a squirrel eating a burger",
@@ -132,6 +138,36 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         }
         return inputs
 
+    def test_from_pipe_consistent_config(self):
+        assert self.original_pipeline_class == StableDiffusionPipeline
+        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
+        original_kwargs = {"requires_safety_checker": False}
+
+        # create original_pipeline_class(sd)
+        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
+
+        # original_pipeline_class(sd) -> pipeline_class
+        pipe_components = self.get_dummy_components()
+        pipe_additional_components = {}
+        for name, component in pipe_components.items():
+            if name not in pipe_original.components:
+                pipe_additional_components[name] = component
+
+        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
+
+        # pipeline_class -> original_pipeline_class(sd)
+        original_pipe_additional_components = {}
+        for name, component in pipe_original.components.items():
+            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
+                original_pipe_additional_components[name] = component
+
+        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
+
+        # compare the config
+        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
+        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
+        assert original_config_2 == original_config
+
     def test_motion_unet_loading(self):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
@@ -144,33 +180,33 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         if torch_device == "cpu":
             expected_pipe_slice = np.array(
                 [
-                    0.5609,
-                    0.5756,
-                    0.4830,
-                    0.4420,
-                    0.4547,
-                    0.5129,
-                    0.3779,
-                    0.4042,
-                    0.3772,
-                    0.4450,
-                    0.5710,
-                    0.5536,
-                    0.4835,
-                    0.4308,
-                    0.5578,
-                    0.5578,
-                    0.4395,
-                    0.5475,
-                    0.5769,
-                    0.4873,
-                    0.5064,
-                    0.4445,
-                    0.5876,
-                    0.5453,
-                    0.4102,
-                    0.5247,
-                    0.5370,
+                    0.3406,
+                    0.4322,
+                    0.3991,
+                    0.3756,
+                    0.5438,
+                    0.4780,
+                    0.5087,
+                    0.5248,
+                    0.6243,
+                    0.5506,
+                    0.3491,
+                    0.5440,
+                    0.6051,
+                    0.4651,
+                    0.6258,
+                    0.5662,
+                    0.3988,
+                    0.5108,
+                    0.4153,
+                    0.3993,
+                    0.4803,
+                    0.6111,
+                    0.5122,
+                    0.5326,
+                    0.5180,
+                    0.5538,
                 ]
             )
         return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
@@ -178,7 +214,7 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
     def test_dict_tuple_outputs_equivalent(self):
         expected_slice = None
         if torch_device == "cpu":
-            expected_slice = np.array([0.3740, 0.4284, 0.4038, 0.5417, 0.4405, 0.5521, 0.4273, 0.4124, 0.4997])
+            expected_slice = np.array([0.5476, 0.4092, 0.5289, 0.4755, 0.5092, 0.5186, 0.5403, 0.5287, 0.5467])
         return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
 
     @unittest.skip("Attention slicing is not enabled in this pipeline")
@@ -286,7 +322,7 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
         inputs = self.get_dummy_inputs(torch_device)
         inputs.pop("prompt")
-        inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device)
+        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
         pipe(**inputs)
 
     def test_free_init(self):