"src/vscode:/vscode.git/clone" did not exist on "dc622a95d0c8dcd1a0badbe6cf6e61ef4eca3917"
Unverified Commit 6886e28f authored by YiYi Xu, committed by GitHub

fix a bug in inpaint pipelines when using a regular text2image unet (#5033)



* fix

* fix num_images_per_prompt >1

* other pipelines

* add fast tests for inpaint pipelines

---------
Co-authored-by: yiyixuxu <yixu310@gmail.com>
parent b089102a
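
For context beyond the commit message: when an inpaint pipeline runs with a regular 4-channel text2image UNet (the num_channels_unet == 4 path), it preserves the known region of the image by blending the denoised latents with re-noised latents of the original image at every step. Before this fix, the loop sliced image_latents[:1] and mask[:1], so with a batch size above 1 (several prompts/images, or num_images_per_prompt > 1) every sample was blended against the first image's latents. A minimal shape sketch of the corrected logic (sizes are illustrative, not from the commit):

    import torch

    batch_size = 2
    # Encoded image latents may have a smaller batch dim than batch_size,
    # e.g. one input image expanded via num_images_per_prompt.
    image_latents = torch.randn(1, 4, 64, 64)
    latents = torch.randn(batch_size, 4, 64, 64)   # one latent per sample
    mask = torch.randn(batch_size * 2, 1, 64, 64)  # duplicated for classifier-free guidance

    # Repeat the image latents up to the batch size (the prepare_latents change) ...
    image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
    init_latents_proper = image_latents  # (2, 4, 64, 64)

    # ... and keep only one CFG half of the mask (the denoising-loop change).
    init_mask, _ = mask.chunk(2)         # (2, 1, 64, 64)

    assert init_latents_proper.shape[0] == latents.shape[0] == init_mask.shape[0]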
@@ -852,6 +852,7 @@ class StableDiffusionControlNetInpaintPipeline(
                 image_latents = image
             else:
                 image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
 
         if latents is None:
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
@@ -1307,8 +1308,11 @@ class StableDiffusionControlNetInpaintPipeline(
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
                 if num_channels_unet == 4:
-                    init_latents_proper = image_latents[:1]
-                    init_mask = mask[:1]
+                    init_latents_proper = image_latents
+                    if do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
 
                     if i < len(timesteps) - 1:
                         noise_timestep = timesteps[i + 1]
......
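
The two removed [:1] slices fed the blending step that sits just below this hunk in the denoising loop. A standalone sketch of that step (hypothetical helper name; the pipelines inline this logic), showing why all three tensors must agree on the batch dimension:

    import torch

    def blend_known_region(latents, init_latents_proper, init_mask):
        # init_mask == 1 marks the region to repaint (keep the denoised latents);
        # init_mask == 0 keeps the re-noised original-image latents.
        return (1 - init_mask) * init_latents_proper + init_mask * latents

    # With the fix, all batch dims agree for batch_size = 2:
    latents = torch.randn(2, 4, 64, 64)
    init_latents_proper = torch.randn(2, 4, 64, 64)
    init_mask = torch.zeros(2, 1, 64, 64)
    assert blend_known_region(latents, init_latents_proper, init_mask).shape == latents.shape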
@@ -742,6 +742,8 @@ class StableDiffusionXLControlNetInpaintPipeline(
             image = image.to(device=device, dtype=dtype)
             image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
         if latents is None and add_noise:
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
             # if strength is 1. then initialise the latents to noise, else initial to image + noise
@@ -1461,8 +1463,11 @@ class StableDiffusionXLControlNetInpaintPipeline(
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
                 if num_channels_unet == 4:
-                    init_latents_proper = image_latents[:1]
-                    init_mask = mask[:1]
+                    init_latents_proper = image_latents
+                    if do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
 
                     if i < len(timesteps) - 1:
                         noise_timestep = timesteps[i + 1]
......
@@ -605,6 +605,7 @@ class StableDiffusionInpaintPipeline(
                 image_latents = image
             else:
                 image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
 
         if latents is None:
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
@@ -997,10 +998,12 @@ class StableDiffusionInpaintPipeline(
                 # compute the previous noisy sample x_t -> x_t-1
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
                 if num_channels_unet == 4:
-                    init_latents_proper = image_latents[:1]
-                    init_mask = mask[:1]
+                    init_latents_proper = image_latents
+                    if do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
 
                     if i < len(timesteps) - 1:
                         noise_timestep = timesteps[i + 1]
......
@@ -667,6 +667,7 @@ class StableDiffusionXLInpaintPipeline(
         elif return_image_latents or (latents is None and not is_strength_max):
             image = image.to(device=device, dtype=dtype)
             image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
 
         if latents is None and add_noise:
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
@@ -1306,8 +1307,11 @@ class StableDiffusionXLInpaintPipeline(
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
                 if num_channels_unet == 4:
-                    init_latents_proper = image_latents[:1]
-                    init_mask = mask[:1]
+                    init_latents_proper = image_latents
+                    if do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
 
                     if i < len(timesteps) - 1:
                         noise_timestep = timesteps[i + 1]
......
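
The same two-part change is applied to all four pipelines. The user-facing scenario it repairs looks roughly like this (checkpoint name and inputs are illustrative; any text2image checkpoint with a 4-channel UNet triggers the num_channels_unet == 4 path):

    from PIL import Image
    from diffusers import StableDiffusionInpaintPipeline

    # Placeholder inputs; use a real photo and mask in practice.
    init_image = Image.new("RGB", (512, 512))
    mask_image = Image.new("L", (512, 512), 255)  # white = region to repaint

    # Loading a text2image checkpoint gives a regular 4-channel UNet.
    pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

    # Before this fix, batch sizes above 1 (here via num_images_per_prompt)
    # blended every sample with the first image's latents.
    images = pipe(
        prompt="a painting of a squirrel eating a burger",
        image=init_image,
        mask_image=mask_image,
        num_images_per_prompt=2,
    ).images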
@@ -365,6 +365,35 @@ class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipeli
         }
         return components
 
+    def get_dummy_inputs_2images(self, device, seed=0, img_res=64):
+        # Get random floats in [0, 1] as image with spatial size (img_res, img_res)
+        image1 = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed)).to(device)
+        image2 = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed + 22)).to(device)
+        # Convert images to [-1, 1]
+        init_image1 = 2.0 * image1 - 1.0
+        init_image2 = 2.0 * image2 - 1.0
+
+        # empty mask
+        mask_image = torch.zeros((1, 1, img_res, img_res), device=device)
+
+        if str(device).startswith("mps"):
+            generator1 = torch.manual_seed(seed)
+            generator2 = torch.manual_seed(seed)
+        else:
+            generator1 = torch.Generator(device=device).manual_seed(seed)
+            generator2 = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": ["A painting of a squirrel eating a burger"] * 2,
+            "image": [init_image1, init_image2],
+            "mask_image": [mask_image] * 2,
+            "generator": [generator1, generator2],
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "numpy",
+        }
+        return inputs
+
     def test_stable_diffusion_inpaint(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()
@@ -385,6 +414,37 @@ class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipeli
     def test_stable_diffusion_inpaint_lora(self):
         ...
 
+    def test_stable_diffusion_inpaint_2_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # test to confirm that passing two identical images gives the same output
+        inputs = self.get_dummy_inputs(device)
+        gen1 = torch.Generator(device=device).manual_seed(0)
+        gen2 = torch.Generator(device=device).manual_seed(0)
+        for name in ["prompt", "image", "mask_image"]:
+            inputs[name] = [inputs[name]] * 2
+        inputs["generator"] = [gen1, gen2]
+
+        images = sd_pipe(**inputs).images
+        assert images.shape == (2, 64, 64, 3)
+
+        image_slice1 = images[0, -3:, -3:, -1]
+        image_slice2 = images[1, -3:, -3:, -1]
+        assert np.abs(image_slice1.flatten() - image_slice2.flatten()).max() < 1e-4
+
+        # test to confirm that passing two different images gives different outputs
+        inputs = self.get_dummy_inputs_2images(device)
+        images = sd_pipe(**inputs).images
+        assert images.shape == (2, 64, 64, 3)
+
+        image_slice1 = images[0, -3:, -3:, -1]
+        image_slice2 = images[1, -3:, -3:, -1]
+        assert np.abs(image_slice1.flatten() - image_slice2.flatten()).max() > 1e-2
 
 
 @slow
 @require_torch_gpu
......
@@ -143,6 +143,35 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
         }
         return inputs
 
+    def get_dummy_inputs_2images(self, device, seed=0, img_res=64):
+        # Get random floats in [0, 1] as image with spatial size (img_res, img_res)
+        image1 = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed)).to(device)
+        image2 = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed + 22)).to(device)
+        # Convert images to [-1, 1]
+        init_image1 = 2.0 * image1 - 1.0
+        init_image2 = 2.0 * image2 - 1.0
+
+        # empty mask
+        mask_image = torch.zeros((1, 1, img_res, img_res), device=device)
+
+        if str(device).startswith("mps"):
+            generator1 = torch.manual_seed(seed)
+            generator2 = torch.manual_seed(seed)
+        else:
+            generator1 = torch.Generator(device=device).manual_seed(seed)
+            generator2 = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": ["A painting of a squirrel eating a burger"] * 2,
+            "image": [init_image1, init_image2],
+            "mask_image": [mask_image] * 2,
+            "generator": [generator1, generator2],
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        return inputs
+
     def test_components_function(self):
         init_components = self.get_dummy_components()
         init_components.pop("requires_aesthetics_score")
@@ -530,3 +559,34 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
         inputs["generator"] = generator
         out_1 = sd_pipe(**inputs).images
         assert np.abs(out_0 - out_1).max() < 1e-2
+
+    def test_stable_diffusion_xl_inpaint_2_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # test to confirm that passing two identical images gives the same output
+        inputs = self.get_dummy_inputs(device)
+        gen1 = torch.Generator(device=device).manual_seed(0)
+        gen2 = torch.Generator(device=device).manual_seed(0)
+        for name in ["prompt", "image", "mask_image"]:
+            inputs[name] = [inputs[name]] * 2
+        inputs["generator"] = [gen1, gen2]
+
+        images = sd_pipe(**inputs).images
+        assert images.shape == (2, 64, 64, 3)
+
+        image_slice1 = images[0, -3:, -3:, -1]
+        image_slice2 = images[1, -3:, -3:, -1]
+        assert np.abs(image_slice1.flatten() - image_slice2.flatten()).max() < 1e-4
+
+        # test to confirm that passing two different images gives different outputs
+        inputs = self.get_dummy_inputs_2images(device)
+        images = sd_pipe(**inputs).images
+        assert images.shape == (2, 64, 64, 3)
+
+        image_slice1 = images[0, -3:, -3:, -1]
+        image_slice2 = images[1, -3:, -3:, -1]
+        assert np.abs(image_slice1.flatten() - image_slice2.flatten()).max() > 1e-2
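
A note on why these tests catch the regression: the dummy masks are all zeros, and in these pipelines mask == 1 marks the region to repaint, so an all-zero mask means the entire image is "known" and each output should reproduce its own input. Before the fix, image_latents[:1] pulled both samples toward the first image, so the divergence assertion above would fail. A sketch of that final check outside the test harness (hypothetical helper, not from the commit):

    import numpy as np

    def check_outputs_diverge(images, min_diff=1e-2):
        # images: a (2, H, W, 3) array from a two-sample pipeline call with
        # two different input images; the corner slices must differ.
        image_slice1 = images[0, -3:, -3:, -1].flatten()
        image_slice2 = images[1, -3:, -3:, -1].flatten()
        assert np.abs(image_slice1 - image_slice2).max() > min_diff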