Unverified Commit b345c74d authored by Patrick von Platen, committed by GitHub

Make sure all pipelines can run with batched input (#1669)



* [SD] Make sure batched input works correctly

* uP

* uP

* up

* up

* uP

* up

* fix mask stuff

* up

* uP

* more up

* up

* uP

* up

* finish

* Apply suggestions from code review
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
parent b4170422
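
For context, the behavior exercised by the new `test_inference_batch_consistent` check and the updated depth2img multi-image test below is that pipelines accept plain Python lists for `prompt` and `image` instead of pre-stacked tensors. A minimal sketch of that kind of batched call follows; the checkpoint id, image path, prompts, and step count are illustrative placeholders, not taken from this diff.

# Minimal sketch (not part of this diff): batched prompts and init images are
# passed as lists; the checkpoint id and image path below are illustrative.
import torch
from PIL import Image

from diffusers import StableDiffusionDepth2ImgPipeline

pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-depth", torch_dtype=torch.float16
).to("cuda")

init_image = Image.open("sketch-mountains-input.jpg").convert("RGB").resize((768, 512))

# two prompts and two init images -> a batch of two generated images
output = pipe(
    prompt=["A fantasy landscape", "An oil painting of two cats"],
    image=[init_image, init_image],
    num_inference_steps=25,
    generator=torch.Generator(device="cuda").manual_seed(0),
)
assert len(output.images) == 2
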
@@ -31,7 +31,7 @@ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint impo
 from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 from ...test_pipelines_common import PipelineTesterMixin
@@ -78,6 +78,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.Test
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
         components = {
             "unet": unet,
@@ -86,7 +87,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.Test
             "text_encoder": text_encoder,
             "tokenizer": tokenizer,
             "safety_checker": None,
-            "feature_extractor": None,
+            "feature_extractor": feature_extractor,
         }
         return components
@@ -136,7 +136,9 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         return components
     def get_dummy_inputs(self, device, seed=0):
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
         if str(device).startswith("mps"):
             generator = torch.manual_seed(seed)
         else:
@@ -171,7 +173,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_loaded = pipe_loaded(**inputs)[0]
         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 3e-5)
+        self.assertLess(max_diff, 1e-4)
     @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
     def test_save_load_float16(self):
@@ -243,7 +245,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_with_offload = pipe(**inputs)[0]
         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 3e-5, "CPU offloading should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")
     @unittest.skipIf(torch_device == "mps", reason="The depth model does not support MPS yet")
     def test_dict_tuple_outputs_equivalent(self):
@@ -260,7 +262,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]
         max_diff = np.abs(output - output_tuple).max()
-        self.assertLess(max_diff, 3e-5)
+        self.assertLess(max_diff, 1e-4)
     @unittest.skipIf(torch_device == "mps", reason="The depth model does not support MPS yet")
     def test_num_inference_steps_consistent(self):
@@ -285,7 +287,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.6071, 0.5035, 0.4378, 0.5776, 0.5753, 0.4316, 0.4513, 0.5263, 0.4546])
         else:
-            expected_slice = np.array([0.6907, 0.5135, 0.4688, 0.5169, 0.5738, 0.4600, 0.4435, 0.5640, 0.4653])
+            expected_slice = np.array([0.6854, 0.3740, 0.4857, 0.7130, 0.7403, 0.5536, 0.4829, 0.6182, 0.5053])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
     def test_stable_diffusion_depth2img_negative_prompt(self):
@@ -305,7 +307,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.5825, 0.5135, 0.4095, 0.5452, 0.6059, 0.4211, 0.3994, 0.5177, 0.4335])
         else:
-            expected_slice = np.array([0.755, 0.521, 0.473, 0.554, 0.629, 0.442, 0.440, 0.582, 0.449])
+            expected_slice = np.array([0.6074, 0.3096, 0.4802, 0.7463, 0.7388, 0.5393, 0.4531, 0.5928, 0.4972])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
     def test_stable_diffusion_depth2img_multiple_init_images(self):
@@ -317,7 +319,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         inputs = self.get_dummy_inputs(device)
         inputs["prompt"] = [inputs["prompt"]] * 2
-        inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
+        inputs["image"] = 2 * [inputs["image"]]
         image = sd_pipe(**inputs).images
         image_slice = image[-1, -3:, -3:, -1]
@@ -326,7 +328,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.6501, 0.5150, 0.4939, 0.6688, 0.5437, 0.5758, 0.5115, 0.4406, 0.4551])
         else:
-            expected_slice = np.array([0.6475, 0.6302, 0.5627, 0.5222, 0.4318, 0.5489, 0.5079, 0.4419, 0.4494])
+            expected_slice = np.array([0.6681, 0.5023, 0.6611, 0.7605, 0.5724, 0.7959, 0.7240, 0.5871, 0.5383])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
     def test_stable_diffusion_depth2img_num_images_per_prompt(self):
@@ -374,7 +376,6 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         inputs = self.get_dummy_inputs(device)
-        inputs["image"] = Image.fromarray(inputs["image"][0].permute(1, 2, 0).numpy().astype(np.uint8))
         image = sd_pipe(**inputs).images
         image_slice = image[0, -3:, -3:, -1]
@@ -452,7 +453,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
         image = output.images[0]
         assert image.shape == (480, 640, 3)
-        assert np.abs(expected_image - image).max() < 1e-3
+        assert np.abs(expected_image - image).max() < 5e-3
     def test_stable_diffusion_depth2img_pipeline_ddim(self):
         init_image = load_image(
@@ -540,8 +541,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
         torch.cuda.reset_peak_memory_stats()
         init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/depth2img/sketch-mountains-input.jpg"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png"
         )
         init_image = init_image.resize((768, 512))
@@ -565,7 +565,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
             guidance_scale=7.5,
             generator=generator,
             output_type="np",
-            num_inference_steps=5,
+            num_inference_steps=2,
         )
         mem_bytes = torch.cuda.max_memory_allocated()
@@ -24,7 +24,7 @@ from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeli
 from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu, slow
 from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 from ...test_pipelines_common import PipelineTesterMixin
@@ -78,6 +78,7 @@ class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.Tes
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
         components = {
             "unet": unet,
@@ -86,7 +87,7 @@ class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.Tes
             "text_encoder": text_encoder,
             "tokenizer": tokenizer,
             "safety_checker": None,
-            "feature_extractor": None,
+            "feature_extractor": feature_extractor,
         }
         return components
@@ -11,6 +11,7 @@ from typing import Callable, Union
 import numpy as np
 import torch
+import diffusers
 from diffusers import (
     CycleDiffusionPipeline,
     DanceDiffusionPipeline,
@@ -18,6 +19,7 @@ from diffusers import (
     StableDiffusionDepth2ImgPipeline,
     StableDiffusionImg2ImgPipeline,
 )
+from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_xformers_available
 from diffusers.utils.testing_utils import require_torch, torch_device
@@ -25,6 +27,9 @@ from diffusers.utils.testing_utils import require_torch, torch_device
 torch.backends.cuda.matmul.allow_tf32 = False
+ALLOWED_REQUIRED_ARGS = ["source_prompt", "prompt", "image", "mask_image", "example_image"]
 @require_torch
 class PipelineTesterMixin:
     """
@@ -94,7 +99,80 @@ class PipelineTesterMixin:
         output_loaded = pipe_loaded(**inputs)[0]
         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 1e-5)
+        self.assertLess(max_diff, 1e-4)
+    def test_pipeline_call_implements_required_args(self):
+        assert hasattr(self.pipeline_class, "__call__"), f"{self.pipeline_class} should have a `__call__` method"
+        parameters = inspect.signature(self.pipeline_class.__call__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        required_parameters.pop("self")
+        required_parameters = set(required_parameters)
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        for param in required_parameters:
+            if param == "kwargs":
+                # kwargs can be added if arguments of pipeline call function are deprecated
+                continue
+            assert param in ALLOWED_REQUIRED_ARGS
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        required_optional_params = ["generator", "num_inference_steps", "return_dict"]
+        for param in required_optional_params:
+            assert param in optional_parameters
+    def test_inference_batch_consistent(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+        # batchify inputs
+        for batch_size in [2, 4, 13]:
+            batched_inputs = {}
+            for name, value in inputs.items():
+                if name in ALLOWED_REQUIRED_ARGS:
+                    # prompt is string
+                    if name == "prompt":
+                        len_prompt = len(value)
+                        # make unequal batch sizes
+                        batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                        # make last batch super long
+                        batched_inputs[name][-1] = 2000 * "very long"
+                    # or else we have images
+                    else:
+                        batched_inputs[name] = batch_size * [value]
+                elif name == "batch_size":
+                    batched_inputs[name] = batch_size
+                else:
+                    batched_inputs[name] = value
+            batched_inputs["num_inference_steps"] = inputs["num_inference_steps"]
+            batched_inputs["output_type"] = None
+            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
+                batched_inputs.pop("output_type")
+            output = pipe(**batched_inputs)
+            assert len(output[0]) == batch_size
+            batched_inputs["output_type"] = "np"
+            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
+                batched_inputs.pop("output_type")
+            output = pipe(**batched_inputs)[0]
+            assert output.shape[0] == batch_size
+        logger.setLevel(level=diffusers.logging.WARNING)
     def test_dict_tuple_outputs_equivalent(self):
         if torch_device == "mps" and self.pipeline_class in (
@@ -118,13 +196,7 @@ class PipelineTesterMixin:
         output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]
         max_diff = np.abs(output - output_tuple).max()
-        self.assertLess(max_diff, 1e-5)
-    def test_pipeline_call_implements_required_args(self):
-        required_args = ["num_inference_steps", "generator", "return_dict"]
-        for arg in required_args:
-            self.assertTrue(arg in inspect.signature(self.pipeline_class.__call__).parameters)
+        self.assertLess(max_diff, 1e-4)
     def test_num_inference_steps_consistent(self):
         components = self.get_dummy_components()
@@ -138,7 +210,7 @@ class PipelineTesterMixin:
         outputs = []
         times = []
-        for num_steps in [3, 6, 9]:
+        for num_steps in [9, 6, 3]:
             inputs = self.get_dummy_inputs(torch_device)
             inputs["num_inference_steps"] = num_steps
@@ -152,7 +224,7 @@ class PipelineTesterMixin:
         # check that all outputs have the same shape
         self.assertTrue(all(outputs[0].shape == output.shape for output in outputs))
         # check that the inference time increases with the number of inference steps
-        self.assertTrue(all(times[i] > times[i - 1] for i in range(1, len(times))))
+        self.assertTrue(all(times[i] < times[i - 1] for i in range(1, len(times))))
     def test_components_function(self):
         init_components = self.get_dummy_components()
@@ -257,7 +329,7 @@ class PipelineTesterMixin:
         output_loaded = pipe_loaded(**inputs)[0]
         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 1e-5)
+        self.assertLess(max_diff, 1e-4)
     @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
     def test_to_device(self):
@@ -332,7 +404,7 @@ class PipelineTesterMixin:
         output_with_offload = pipe(**inputs)[0]
         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 1e-5, "CPU offloading should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
@@ -355,7 +427,7 @@ class PipelineTesterMixin:
         output_with_offload = pipe(**inputs)[0]
         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 1e-5, "XFormers attention should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
     def test_progress_bar(self):
         components = self.get_dummy_components()