"vscode:/vscode.git/clone" did not exist on "3ad4207d1fb639af3a86d3fe5fc60406d1d7e9fe"
Unverified Commit b345c74d authored by Patrick von Platen, committed by GitHub

Make sure all pipelines can run with batched input (#1669)



* [SD] Make sure batched input works correctly

* uP

* uP

* up

* up

* uP

* up

* fix mask stuff

* up

* uP

* more up

* up

* uP

* up

* finish

* Apply suggestions from code review
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
parent b4170422
@@ -31,7 +31,7 @@ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint impo
 from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer

 from ...test_pipelines_common import PipelineTesterMixin
@@ -78,6 +78,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.Test
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

         components = {
             "unet": unet,
@@ -86,7 +87,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.Test
             "text_encoder": text_encoder,
             "tokenizer": tokenizer,
             "safety_checker": None,
-            "feature_extractor": None,
+            "feature_extractor": feature_extractor,
         }
         return components
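Note on the `feature_extractor` change above: instead of `None`, the fast tests now register a real `CLIPImageProcessor` sized to match the 32x32 images the dummy components produce. A minimal standalone sketch (not part of the diff; the dummy image is hypothetical) of what that component does:

```python
# Minimal sketch: the size-matched image processor the fast tests now
# pass as `feature_extractor` instead of `None`.
from PIL import Image
from transformers import CLIPImageProcessor

feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

# Given any PIL images, it resizes/crops to 32x32 and normalizes to tensors:
dummy = Image.new("RGB", (64, 64))
pixel_values = feature_extractor(images=[dummy, dummy], return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([2, 3, 32, 32])
```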
......
@@ -136,7 +136,9 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         return components

     def get_dummy_inputs(self, device, seed=0):
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
         if str(device).startswith("mps"):
             generator = torch.manual_seed(seed)
         else:
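The `get_dummy_inputs` change above switches the dummy image from a device tensor to a PIL image, the input type the new batching tests exercise. A standalone sketch of the same conversion (random data in [0, 1), so the resulting image is nearly black, which is fine for a shape test):

```python
# Sketch of the NCHW-tensor -> PIL conversion now done in get_dummy_inputs:
import numpy as np
import torch
from PIL import Image

tensor = torch.rand(1, 3, 32, 32)                    # NCHW float tensor in [0, 1)
array = tensor.cpu().permute(0, 2, 3, 1)[0].numpy()  # -> HWC numpy array
image = Image.fromarray(np.uint8(array)).convert("RGB").resize((32, 32))
```

A PIL input lets a batch be expressed as a plain Python list of images, which is what the multiple-init-images change further down relies on.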
@@ -171,7 +173,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_loaded = pipe_loaded(**inputs)[0]

         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 3e-5)
+        self.assertLess(max_diff, 1e-4)

     @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
     def test_save_load_float16(self):
@@ -243,7 +245,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_with_offload = pipe(**inputs)[0]

         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 3e-5, "CPU offloading should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")

     @unittest.skipIf(torch_device == "mps", reason="The depth model does not support MPS yet")
     def test_dict_tuple_outputs_equivalent(self):
@@ -260,7 +262,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]

         max_diff = np.abs(output - output_tuple).max()
-        self.assertLess(max_diff, 3e-5)
+        self.assertLess(max_diff, 1e-4)

     @unittest.skipIf(torch_device == "mps", reason="The depth model does not support MPS yet")
     def test_num_inference_steps_consistent(self):
@@ -285,7 +287,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.6071, 0.5035, 0.4378, 0.5776, 0.5753, 0.4316, 0.4513, 0.5263, 0.4546])
         else:
-            expected_slice = np.array([0.6907, 0.5135, 0.4688, 0.5169, 0.5738, 0.4600, 0.4435, 0.5640, 0.4653])
+            expected_slice = np.array([0.6854, 0.3740, 0.4857, 0.7130, 0.7403, 0.5536, 0.4829, 0.6182, 0.5053])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

     def test_stable_diffusion_depth2img_negative_prompt(self):
@@ -305,7 +307,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.5825, 0.5135, 0.4095, 0.5452, 0.6059, 0.4211, 0.3994, 0.5177, 0.4335])
         else:
-            expected_slice = np.array([0.755, 0.521, 0.473, 0.554, 0.629, 0.442, 0.440, 0.582, 0.449])
+            expected_slice = np.array([0.6074, 0.3096, 0.4802, 0.7463, 0.7388, 0.5393, 0.4531, 0.5928, 0.4972])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

     def test_stable_diffusion_depth2img_multiple_init_images(self):
@@ -317,7 +319,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         inputs = self.get_dummy_inputs(device)
         inputs["prompt"] = [inputs["prompt"]] * 2
-        inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
+        inputs["image"] = 2 * [inputs["image"]]
         image = sd_pipe(**inputs).images
         image_slice = image[-1, -3:, -3:, -1]
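With PIL inputs, batching no longer needs `tensor.repeat(...)`: a batch is just a Python list whose length matches the prompt list. A hedged usage sketch (the pipeline call is commented out because `pipe` is a placeholder, not part of the diff):

```python
# Hypothetical usage: a batch of two is expressed as two parallel lists.
from PIL import Image

init_image = Image.new("RGB", (32, 32))          # stand-in for a real image
prompts = ["A painting, oil on canvas"] * 2
images = [init_image] * 2                        # one PIL image per prompt
# result = pipe(prompt=prompts, image=images, num_inference_steps=2, output_type="np")
# assert result.images.shape[0] == 2
```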
@@ -326,7 +328,7 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         if torch_device == "mps":
             expected_slice = np.array([0.6501, 0.5150, 0.4939, 0.6688, 0.5437, 0.5758, 0.5115, 0.4406, 0.4551])
         else:
-            expected_slice = np.array([0.6475, 0.6302, 0.5627, 0.5222, 0.4318, 0.5489, 0.5079, 0.4419, 0.4494])
+            expected_slice = np.array([0.6681, 0.5023, 0.6611, 0.7605, 0.5724, 0.7959, 0.7240, 0.5871, 0.5383])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

     def test_stable_diffusion_depth2img_num_images_per_prompt(self):
@@ -374,7 +376,6 @@ class StableDiffusiondepth2imgPipelineFastTests(PipelineTesterMixin, unittest.Te
         inputs = self.get_dummy_inputs(device)
-        inputs["image"] = Image.fromarray(inputs["image"][0].permute(1, 2, 0).numpy().astype(np.uint8))
         image = sd_pipe(**inputs).images
         image_slice = image[0, -3:, -3:, -1]
@@ -452,7 +453,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
         image = output.images[0]

         assert image.shape == (480, 640, 3)
-        assert np.abs(expected_image - image).max() < 1e-3
+        assert np.abs(expected_image - image).max() < 5e-3

     def test_stable_diffusion_depth2img_pipeline_ddim(self):
         init_image = load_image(
@@ -540,8 +541,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
         torch.cuda.reset_peak_memory_stats()

         init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/depth2img/sketch-mountains-input.jpg"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png"
         )
         init_image = init_image.resize((768, 512))
@@ -565,7 +565,7 @@ class StableDiffusionDepth2ImgPipelineIntegrationTests(unittest.TestCase):
             guidance_scale=7.5,
             generator=generator,
             output_type="np",
-            num_inference_steps=5,
+            num_inference_steps=2,
         )

         mem_bytes = torch.cuda.max_memory_allocated()
......
@@ -24,7 +24,7 @@ from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeli
 from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu, slow
 from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer

 from ...test_pipelines_common import PipelineTesterMixin
@@ -78,6 +78,7 @@ class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.Tes
         )
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

         components = {
             "unet": unet,
@@ -86,7 +87,7 @@ class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.Tes
             "text_encoder": text_encoder,
             "tokenizer": tokenizer,
             "safety_checker": None,
-            "feature_extractor": None,
+            "feature_extractor": feature_extractor,
         }
         return components
......
@@ -11,6 +11,7 @@ from typing import Callable, Union
 import numpy as np
 import torch

+import diffusers
 from diffusers import (
     CycleDiffusionPipeline,
     DanceDiffusionPipeline,
@@ -18,6 +19,7 @@ from diffusers import (
     StableDiffusionDepth2ImgPipeline,
     StableDiffusionImg2ImgPipeline,
 )
+from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_xformers_available
 from diffusers.utils.testing_utils import require_torch, torch_device
@@ -25,6 +27,9 @@ from diffusers.utils.testing_utils import require_torch, torch_device
 torch.backends.cuda.matmul.allow_tf32 = False

+ALLOWED_REQUIRED_ARGS = ["source_prompt", "prompt", "image", "mask_image", "example_image"]
+
+
 @require_torch
 class PipelineTesterMixin:
     """
@@ -94,7 +99,80 @@ class PipelineTesterMixin:
         output_loaded = pipe_loaded(**inputs)[0]

         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 1e-5)
+        self.assertLess(max_diff, 1e-4)
+
+    def test_pipeline_call_implements_required_args(self):
+        assert hasattr(self.pipeline_class, "__call__"), f"{self.pipeline_class} should have a `__call__` method"
+        parameters = inspect.signature(self.pipeline_class.__call__).parameters
+
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        required_parameters.pop("self")
+        required_parameters = set(required_parameters)
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+
+        for param in required_parameters:
+            if param == "kwargs":
+                # kwargs can be added if arguments of pipeline call function are deprecated
+                continue
+            assert param in ALLOWED_REQUIRED_ARGS
+
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        required_optional_params = ["generator", "num_inference_steps", "return_dict"]
+        for param in required_optional_params:
+            assert param in optional_parameters
+
+    def test_inference_batch_consistent(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        logger = logging.get_logger(pipe.__module__)
+        logger.setLevel(level=diffusers.logging.FATAL)
+
+        # batchify inputs
+        for batch_size in [2, 4, 13]:
+            batched_inputs = {}
+            for name, value in inputs.items():
+                if name in ALLOWED_REQUIRED_ARGS:
+                    # prompt is string
+                    if name == "prompt":
+                        len_prompt = len(value)
+                        # make unequal batch sizes
+                        batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+
+                        # make last batch super long
+                        batched_inputs[name][-1] = 2000 * "very long"
+                    # or else we have images
+                    else:
+                        batched_inputs[name] = batch_size * [value]
+                elif name == "batch_size":
+                    batched_inputs[name] = batch_size
+                else:
+                    batched_inputs[name] = value
+
+            batched_inputs["num_inference_steps"] = inputs["num_inference_steps"]
+            batched_inputs["output_type"] = None
+
+            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
+                batched_inputs.pop("output_type")
+
+            output = pipe(**batched_inputs)
+            assert len(output[0]) == batch_size
+
+            batched_inputs["output_type"] = "np"
+
+            if self.pipeline_class.__name__ == "DanceDiffusionPipeline":
+                batched_inputs.pop("output_type")
+
+            output = pipe(**batched_inputs)[0]
+            assert output.shape[0] == batch_size
+
+        logger.setLevel(level=diffusers.logging.WARNING)

     def test_dict_tuple_outputs_equivalent(self):
         if torch_device == "mps" and self.pipeline_class in (
@@ -118,13 +196,7 @@ class PipelineTesterMixin:
         output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]

         max_diff = np.abs(output - output_tuple).max()
-        self.assertLess(max_diff, 1e-5)
-
-    def test_pipeline_call_implements_required_args(self):
-        required_args = ["num_inference_steps", "generator", "return_dict"]
-        for arg in required_args:
-            self.assertTrue(arg in inspect.signature(self.pipeline_class.__call__).parameters)
+        self.assertLess(max_diff, 1e-4)

     def test_num_inference_steps_consistent(self):
         components = self.get_dummy_components()
@@ -138,7 +210,7 @@ class PipelineTesterMixin:
         outputs = []
         times = []
-        for num_steps in [3, 6, 9]:
+        for num_steps in [9, 6, 3]:
             inputs = self.get_dummy_inputs(torch_device)
             inputs["num_inference_steps"] = num_steps
@@ -152,7 +224,7 @@ class PipelineTesterMixin:
         # check that all outputs have the same shape
         self.assertTrue(all(outputs[0].shape == output.shape for output in outputs))
         # check that the inference time increases with the number of inference steps
-        self.assertTrue(all(times[i] > times[i - 1] for i in range(1, len(times))))
+        self.assertTrue(all(times[i] < times[i - 1] for i in range(1, len(times))))

     def test_components_function(self):
         init_components = self.get_dummy_components()
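On the reordering to `[9, 6, 3]` above: our reading is that running the largest step count first lets one-time warm-up costs (CUDA context creation, kernel selection) land on the slowest run, so asserting strictly decreasing times is less flaky than asserting strictly increasing times on an ascending schedule. A toy, runnable demonstration of the pattern (`fake_pipe` is a stand-in, not from the diff):

```python
# Toy sketch: with a descending step schedule, each run should be faster
# than the previous one even when the first call pays warm-up overhead.
import time

def fake_pipe(num_inference_steps):
    # stand-in for a real pipeline call: cost grows with the step count
    time.sleep(0.01 * num_inference_steps)

times = []
for num_steps in [9, 6, 3]:  # largest first: warm-up lands on the slowest run
    start = time.time()
    fake_pipe(num_inference_steps=num_steps)
    times.append(time.time() - start)

assert all(times[i] < times[i - 1] for i in range(1, len(times)))
```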
@@ -257,7 +329,7 @@ class PipelineTesterMixin:
         output_loaded = pipe_loaded(**inputs)[0]

         max_diff = np.abs(output - output_loaded).max()
-        self.assertLess(max_diff, 1e-5)
+        self.assertLess(max_diff, 1e-4)

     @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
     def test_to_device(self):
@@ -332,7 +404,7 @@ class PipelineTesterMixin:
         output_with_offload = pipe(**inputs)[0]

         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 1e-5, "CPU offloading should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")

     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
@@ -355,7 +427,7 @@ class PipelineTesterMixin:
         output_with_offload = pipe(**inputs)[0]

         max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, 1e-5, "XFormers attention should not affect the inference results")
+        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")

     def test_progress_bar(self):
         components = self.get_dummy_components()
......