Unverified Commit b345c74d authored by Patrick von Platen, committed by GitHub

Make sure all pipelines can run with batched input (#1669)



* [SD] Make sure batched input works correctly

* uP

* uP

* up

* up

* uP

* up

* fix mask stuff

* up

* uP

* more up

* up

* uP

* up

* finish

* Apply suggestions from code review
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
parent b4170422
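
For context, the behavior exercised by the new `test_inference_batch_consistent` check and the updated depth2img multi-image test below is that pipelines accept plain Python lists for `prompt` and `image` instead of pre-stacked tensors. A minimal sketch of that kind of batched call follows; the checkpoint id, image path, prompts, and step count are illustrative placeholders, not taken from this diff.

# Minimal sketch (not part of this diff): batched prompts and init images are
# passed as lists; the checkpoint id and image path below are illustrative.
import torch
from PIL import Image

from diffusers import StableDiffusionDepth2ImgPipeline

pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-depth", torch_dtype=torch.float16
).to("cuda")

init_image = Image.open("sketch-mountains-input.jpg").convert("RGB").resize((768, 512))

# two prompts and two init images -> a batch of two generated images
output = pipe(
    prompt=["A fantasy landscape", "An oil painting of two cats"],
    image=[init_image, init_image],
    num_inference_steps=25,
    generator=torch.Generator(device="cuda").manual_seed(0),
)
assert len(output.images) == 2
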
@@ -31,7 +31,7 @@ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint impo
 from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 from ...test_pipelines_common import PipelineTesterMixin
@@ -78,6 +78,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.Test
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
         components = {
             "unet": unet,
@@ -86,7 +87,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.Test
             "text_encoder": text_encoder,
             "tokenizer": tokenizer,
             "safety_checker": None,
-            "feature_extractor": None,
+            "feature_extractor": feature_extractor,
         }
         return components
@@ -136,7 +136,9 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         return components
     def get_dummy_inputs(self, device, seed=0):
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
         if str(device).startswith("mps"):
             generator = torch.manual_seed(seed)
         else:
@@ -171,7 +173,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_loaded = pipe_loaded(**inputs)[0]
         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 3e-5)
+        self.assertLess(max_diff, 1e-4)
     @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
     def test_save_load_float16(self):
@@ -243,7 +245,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_with_offload = pipe(**inputs)[0]
         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 3e-5, "CPU offloading should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")
     @unittest.skipIf(torch_device == "mps", reason="The depth model does not support MPS yet")
     def test_dict_tuple_outputs_equivalent(self):
@@ -260,7 +262,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]
         max_diff = np.abs(output - output_tuple).max()
-        self.assertLess(max_diff, 3e-5)
+        self.assertLess(max_diff, 1e-4)
     @unittest.skipIf(torch_device == "mps", reason="The depth model does not support MPS yet")
     def test_num_inference_steps_consistent(self):
@@ -285,7 +287,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.6071, 0.5035, 0.4378, 0.5776, 0.5753, 0.4316, 0.4513, 0.5263, 0.4546])
         else:
-            expected_slice = np.array([0.6907, 0.5135, 0.4688, 0.5169, 0.5738, 0.4600, 0.4435, 0.5640, 0.4653])
+            expected_slice = np.array([0.6854, 0.3740, 0.4857, 0.7130, 0.7403, 0.5536, 0.4829, 0.6182, 0.5053])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
     def test_stable_diffusion_depth2img_negative_prompt(self):
@@ -305,7 +307,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.5825, 0.5135, 0.4095, 0.5452, 0.6059, 0.4211, 0.3994, 0.5177, 0.4335])
         else:
-            expected_slice = np.array([0.755, 0.521, 0.473, 0.554, 0.629, 0.442, 0.440, 0.582, 0.449])
+            expected_slice = np.array([0.6074, 0.3096, 0.4802, 0.7463, 0.7388, 0.5393, 0.4531, 0.5928, 0.4972])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
     def test_stable_diffusion_depth2img_multiple_init_images(self):
@@ -317,7 +319,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         inputs = self.get_dummy_inputs(device)
         inputs["prompt"] = [inputs["prompt"]] * 2
-        inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
+        inputs["image"] = 2 * [inputs["image"]]
         image = sd_pipe(**inputs).images
         image_slice = image[-1, -3:, -3:, -1]
@@ -326,7 +328,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.6501, 0.5150, 0.4939, 0.6688, 0.5437, 0.5758, 0.5115, 0.4406, 0.4551])
         else:
-            expected_slice = np.array([0.6475, 0.6302, 0.5627, 0.5222, 0.4318, 0.5489, 0.5079, 0.4419, 0.4494])
+            expected_slice = np.array([0.6681, 0.5023, 0.6611, 0.7605, 0.5724, 0.7959, 0.7240, 0.5871, 0.5383])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
     def test_stable_diffusion_depth2img_num_images_per_prompt(self):
@@ -374,7 +376,6 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         inputs = self.get_dummy_inputs(device)
-        inputs["image"] = Image.fromarray(inputs["image"][0].permute(1, 2, 0).numpy().astype(np.uint8))
         image = sd_pipe(**inputs).images
         image_slice = image[0, -3:, -3:, -1]
@@ -452,7 +453,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
         image = output.images[0]
         assert image.shape == (480, 640, 3)
-        assert np.abs(expected_image - image).max() < 1e-3
+        assert np.abs(expected_image - image).max() < 5e-3
     def test_stable_diffusion_depth2img_pipeline_ddim(self):
         init_image = load_image(
@@ -540,8 +541,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
         torch.cuda.reset_peak_memory_stats()
         init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/depth2img/sketch-mountains-input.jpg"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png"
         )
         init_image = init_image.resize((768, 512))
@@ -565,7 +565,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
             guidance_scale=7.5,
             generator=generator,
             output_type="np",
-            num_inference_steps=5,
+            num_inference_steps=2,
         )
         mem_bytes = torch.cuda.max_memory_allocated()
@@ -24,7 +24,7 @@ from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeli
 from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu, slow
 from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 from ...test_pipelines_common import PipelineTesterMixin
@@ -78,6 +78,7 @@ class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.Tes
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
         components = {
             "unet": unet,
@@ -86,7 +87,7 @@ class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.Tes
             "text_encoder": text_encoder,
             "tokenizer": tokenizer,
             "safety_checker": None,
-            "feature_extractor": None,
+            "feature_extractor": feature_extractor,
         }
         return components
@@ -11,6 +11,7 @@ from typing import Callable, Union
 import numpy as np
 import torch
+import diffusers
 from diffusers import (
     CycleDiffusionPipeline,
     DanceDiffusionPipeline,
@@ -18,6 +19,7 @@ from diffusers import (
     StableDiffusionDepth2ImgPipeline,
     StableDiffusionImg2ImgPipeline,
 )
+from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_xformers_available
 from diffusers.utils.testing_utils import require_torch, torch_device
@@ -25,6 +27,9 @@ from diffusers.utils.testing_utils import require_torch, torch_device
 torch.backends.cuda.matmul.allow_tf32 = False
+ALLOWED_REQUIRED_ARGS = ["source_prompt", "prompt", "image", "mask_image", "example_image"]
 @require_torch
 class PipelineTesterMixin:
     """
@@ -94,7 +99,80 @@ class PipelineTesterMixin:
         output_loaded = pipe_loaded(**inputs)[0]
         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 1e-5)
+        self.assertLess(max_diff, 1e-4)
+    def test_pipeline_call_implements_required_args(self):
+        assert hasattr(self.pipeline_class, "__call__"), f"{self.pipeline_class} should have a `__call__` method"
+        parameters = inspect.signature(self.pipeline_class.__call__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        required_parameters.pop("self")
+        required_parameters = set(required_parameters)
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        for param in required_parameters:
+            if param == "kwargs":
+                # kwargs can be added if arguments of pipeline call function are deprecated
+                continue
+            assert param in ALLOWED_REQUIRED_ARGS
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        required_optional_params = ["generator", "num_inference_steps", "return_dict"]
+        for param in required_optional_params:
+            assert param in optional_parameters
+    def test_inference_batch_consistent(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        inputs = self.get_dummy_inputs(torch_device)
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+        # batchify inputs
+        for batch_size in [2, 4, 13]:
+            batched_inputs = {}
+            for name, value in inputs.items():
+                if name in ALLOWED_REQUIRED_ARGS:
+                    # prompt is string
+                    if name == "prompt":
+                        len_prompt = len(value)
+                        # make unequal batch sizes
+                        batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+                        # make last batch super long
+                        batched_inputs[name][-1] = 2000 * "very long"
+                    # or else we have images
+                    else:
+                        batched_inputs[name] = batch_size * [value]
+                elif name == "batch_size":
+                    batched_inputs[name] = batch_size
+                else:
+                    batched_inputs[name] = value
+            batched_inputs["num_inference_steps"] = inputs["num_inference_steps"]
+            batched_inputs["output_type"] = None
+            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
+                batched_inputs.pop("output_type")
+            output = pipe(**batched_inputs)
+            assert len(output[0]) == batch_size
+            batched_inputs["output_type"] = "np"
+            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
+                batched_inputs.pop("output_type")
+            output = pipe(**batched_inputs)[0]
+            assert output.shape[0] == batch_size
+        logger.setLevel(level=diffusers.logging.WARNING)
     def test_dict_tuple_outputs_equivalent(self):
         if torch_device == "mps" and self.pipeline_class in (
@@ -118,13 +196,7 @@ class PipelineTesterMixin:
         output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]
         max_diff = np.abs(output - output_tuple).max()
-        self.assertLess(max_diff, 1e-5)
-    def test_pipeline_call_implements_required_args(self):
-        required_args = ["num_inference_steps", "generator", "return_dict"]
-        for arg in required_args:
-            self.assertTrue(arg in inspect.signature(self.pipeline_class.__call__).parameters)
+        self.assertLess(max_diff, 1e-4)
     def test_num_inference_steps_consistent(self):
         components = self.get_dummy_components()
@@ -138,7 +210,7 @@ class PipelineTesterMixin:
         outputs = []
         times = []
-        for num_steps in [3, 6, 9]:
+        for num_steps in [9, 6, 3]:
             inputs = self.get_dummy_inputs(torch_device)
             inputs["num_inference_steps"] = num_steps
@@ -152,7 +224,7 @@ class PipelineTesterMixin:
         # check that all outputs have the same shape
         self.assertTrue(all(outputs[0].shape == output.shape for output in outputs))
         # check that the inference time increases with the number of inference steps
-        self.assertTrue(all(times[i] > times[i - 1] for i in range(1, len(times))))
+        self.assertTrue(all(times[i] < times[i - 1] for i in range(1, len(times))))
     def test_components_function(self):
         init_components = self.get_dummy_components()
@@ -257,7 +329,7 @@ class PipelineTesterMixin:
         output_loaded = pipe_loaded(**inputs)[0]
         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 1e-5)
+        self.assertLess(max_diff, 1e-4)
     @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
     def test_to_device(self):
@@ -332,7 +404,7 @@ class PipelineTesterMixin:
         output_with_offload = pipe(**inputs)[0]
         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 1e-5, "CPU offloading should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
@@ -355,7 +427,7 @@ class PipelineTesterMixin:
         output_with_offload = pipe(**inputs)[0]
         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 1e-5, "XFormers attention should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
     def test_progress_bar(self):
         components = self.get_dummy_components()