Unverified Commit 52eb0348 authored by fboulnois, committed by GitHub

Standardize on using `image` argument in all pipelines (#1361)

* feat: switch core pipelines to use image arg

* test: update tests for core pipelines

* feat: switch examples to use image arg

* docs: update docs to use image arg

* style: format code using black and doc-builder

* fix: deprecate use of init_image in all pipelines
parent 2bbf8b67
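
The pattern applied across every pipeline in this commit is a thin backward-compatibility shim: the public argument becomes `image`, while `init_image` is still accepted through `**kwargs` and routed via `diffusers.utils.deprecate` until its removal in 0.12.0. A minimal sketch of the behavior, using a simplified stand-in for the real `deprecate` helper (which also handles version checks and multiple kwargs):

```python
import warnings

def deprecate(name, removal_version, message, take_from):
    """Simplified stand-in for diffusers.utils.deprecate: pop the old kwarg,
    warn if it was passed, and hand its value back to the caller."""
    value = take_from.pop(name, None)
    if value is not None:
        warnings.warn(
            f"`{name}` is deprecated and will be removed in {removal_version}. {message}",
            FutureWarning,
        )
    return value

def call_pipeline(prompt, image=None, **kwargs):
    # Old callers that pass `init_image=...` land in **kwargs and get remapped.
    init_image = deprecate(
        "init_image", "0.12.0", "Please use `image` instead of `init_image`.", take_from=kwargs
    )
    image = init_image or image
    return prompt, image
```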
......@@ -280,7 +280,7 @@ init_image = init_image.resize((768, 512))
prompt = "A fantasy landscape, trending on artstation"
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
images[0].save("fantasy_landscape.png")
```
......
......@@ -57,7 +57,7 @@ prompt = "An astronaut riding an elephant"
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
init_image=init_image,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.8,
......@@ -83,7 +83,7 @@ torch.manual_seed(0)
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
init_image=init_image,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.85,
......
......@@ -149,7 +149,7 @@ init_image = init_image.resize((768, 512))
prompt = "A fantasy landscape, trending on artstation"
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
images[0].save("fantasy_landscape.png")
```
......
......@@ -177,7 +177,7 @@ init_image = download_image(
prompt = "A fantasy landscape, trending on artstation"
images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
### Inpainting
......@@ -187,7 +187,7 @@ init_image = download_image(img_url).resize((512, 512))
mask_image = download_image(mask_url).resize((512, 512))
prompt = "a cat sitting on a bench"
images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
```
As shown above, this one pipeline can run "text-to-image", "image-to-image", and "inpainting" all in a single pipeline.
......
......@@ -37,7 +37,7 @@ init_image.thumbnail((768, 768))
prompt = "A fantasy landscape, trending on artstation"
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
images[0].save("fantasy_landscape.png")
```
......
......@@ -166,7 +166,7 @@ init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-di
prompt = "A fantasy landscape, trending on artstation"
images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
### Inpainting
......@@ -176,7 +176,7 @@ init_image = download_image(img_url).resize((512, 512))
mask_image = download_image(mask_url).resize((512, 512))
prompt = "a cat sitting on a bench"
images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
```
As shown above, this one pipeline can run "text-to-image", "image-to-image", and "inpainting" all in a single pipeline.
......@@ -420,7 +420,7 @@ init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((512, 512))
res = pipe.train(
prompt,
init_image,
image=init_image,
guidance_scale=7.5,
num_inference_steps=50,
generator=generator)
......
......@@ -17,7 +17,7 @@ from diffusers.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from diffusers.utils import logging
from diffusers.utils import deprecate, logging
# TODO: remove and import from diffusers.utils when the new version of diffusers is released
from packaging import version
......@@ -133,7 +133,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
def train(
self,
prompt: Union[str, List[str]],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
height: Optional[int] = 512,
width: Optional[int] = 512,
generator: Optional[torch.Generator] = None,
......@@ -184,6 +184,10 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
accelerator = Accelerator(
gradient_accumulation_steps=1,
mixed_precision="fp16",
......@@ -241,14 +245,14 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
lr=embedding_learning_rate,
)
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
latents_dtype = text_embeddings.dtype
init_image = init_image.to(device=self.device, dtype=latents_dtype)
init_latent_image_dist = self.vae.encode(init_image).latent_dist
init_image_latents = init_latent_image_dist.sample(generator=generator)
init_image_latents = 0.18215 * init_image_latents
image = image.to(device=self.device, dtype=latents_dtype)
init_latent_image_dist = self.vae.encode(image).latent_dist
image_latents = init_latent_image_dist.sample(generator=generator)
image_latents = 0.18215 * image_latents
progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
progress_bar.set_description("Steps")
......@@ -259,12 +263,12 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
for _ in range(text_embedding_optimization_steps):
with accelerator.accumulate(text_embeddings):
# Sample noise that we'll add to the latents
noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
noise = torch.randn(image_latents.shape).to(image_latents.device)
timesteps = torch.randint(1000, (1,), device=image_latents.device)
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
# Predict the noise residual
noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
......@@ -301,12 +305,12 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
for _ in range(model_fine_tuning_optimization_steps):
with accelerator.accumulate(self.unet.parameters()):
# Sample noise that we'll add to the latents
noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
noise = torch.randn(image_latents.shape).to(image_latents.device)
timesteps = torch.randint(1000, (1,), device=image_latents.device)
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
# Predict the noise residual
noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
......
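
Both training loops above rely on `scheduler.add_noise` for the forward diffusion step. As a rough sketch of what that call computes, assuming a scheduler that exposes a cumulative-alpha schedule `alphas_cumprod` (as the DDIM/PNDM schedulers do):

```python
import torch

def add_noise_sketch(latents: torch.Tensor, noise: torch.Tensor,
                     timesteps: torch.Tensor, alphas_cumprod: torch.Tensor) -> torch.Tensor:
    """Forward diffusion: x_t = sqrt(a_bar_t) * x_0 + sqrt(1 - a_bar_t) * eps."""
    a_bar = alphas_cumprod[timesteps].flatten()
    while a_bar.dim() < latents.dim():
        a_bar = a_bar.unsqueeze(-1)  # broadcast per-sample over (B, C, H, W)
    return a_bar.sqrt() * latents + (1 - a_bar).sqrt() * noise
```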
......@@ -555,7 +555,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
self,
prompt: Union[str, List[str]],
negative_prompt: Optional[Union[str, List[str]]] = None,
init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
image: Union[torch.FloatTensor, PIL.Image.Image] = None,
mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
height: int = 512,
width: int = 512,
......@@ -583,11 +583,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
if `guidance_scale` is less than `1`).
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
......@@ -605,11 +605,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
1`. Higher guidance scale encourages the model to generate images that are closely linked to the text
`prompt`, usually at the expense of lower image quality.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
......@@ -648,6 +648,9 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
if isinstance(prompt, str):
batch_size = 1
......@@ -714,7 +717,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
mask = None
noise = None
if init_image is None:
if image is None:
# get the initial random noise unless the user supplied it
# Unlike in other pipelines, latents need to be generated on the target device
......@@ -753,11 +756,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
else:
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess_image(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess_image(image)
# encode the init image into latents and scale the latents
init_image = init_image.to(device=self.device, dtype=latents_dtype)
init_latent_dist = self.vae.encode(init_image).latent_dist
image = image.to(device=self.device, dtype=latents_dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
init_latents = 0.18215 * init_latents
init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
......@@ -772,7 +775,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
# check sizes
if not mask.shape == init_latents.shape:
raise ValueError("The mask and init_image should be the same size!")
raise ValueError("The mask and image should be the same size!")
# get the original timestep using init_timestep
offset = self.scheduler.config.get("steps_offset", 0)
......@@ -961,7 +964,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
def img2img(
self,
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
prompt: Union[str, List[str]],
negative_prompt: Optional[Union[str, List[str]]] = None,
strength: float = 0.8,
......@@ -980,7 +983,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
r"""
Function for image-to-image generation.
Args:
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
prompt (`str` or `List[str]`):
......@@ -989,11 +992,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
if `guidance_scale` is less than `1`).
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
......@@ -1035,7 +1038,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
return self.__call__(
prompt=prompt,
negative_prompt=negative_prompt,
init_image=init_image,
image=image,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
strength=strength,
......@@ -1052,7 +1055,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
def inpaint(
self,
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
prompt: Union[str, List[str]],
negative_prompt: Optional[Union[str, List[str]]] = None,
......@@ -1072,11 +1075,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
r"""
Function for inpainting.
Args:
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process. This is the image whose masked region will be inpainted.
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
......@@ -1088,7 +1091,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
is 1, the denoising process will be run on the masked area for the full number of iterations specified
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
num_inference_steps (`int`, *optional*, defaults to 50):
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
......@@ -1131,7 +1134,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
return self.__call__(
prompt=prompt,
negative_prompt=negative_prompt,
init_image=init_image,
image=image,
mask_image=mask_image,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
......
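
After this change, every entry point of the long-prompt-weighting pipeline takes `image=`. A hedged usage sketch (the `custom_pipeline` name `lpw_stable_diffusion` and the local image paths are assumptions for illustration):

```python
from PIL import Image
from diffusers import DiffusionPipeline

# Community pipeline; the name is assumed here for illustration.
pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", custom_pipeline="lpw_stable_diffusion"
)

init_image = Image.open("landscape.png").convert("RGB")  # placeholder input
mask_image = Image.open("mask.png").convert("L")         # placeholder mask

# All three entry points now accept `image=` (previously `init_image=`):
img = pipe.img2img(prompt="A fantasy landscape", image=init_image, strength=0.75).images[0]
inp = pipe.inpaint(prompt="a cat sitting on a bench", image=init_image,
                   mask_image=mask_image, strength=0.75).images[0]
```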
......@@ -10,7 +10,7 @@ from diffusers.onnx_utils import OnnxRuntimeModel
from diffusers.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from diffusers.utils import logging
from diffusers.utils import deprecate, logging
# TODO: remove and import from diffusers.utils when the new version of diffusers is released
from packaging import version
......@@ -441,7 +441,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
self,
prompt: Union[str, List[str]],
negative_prompt: Optional[Union[str, List[str]]] = None,
init_image: Union[np.ndarray, PIL.Image.Image] = None,
image: Union[np.ndarray, PIL.Image.Image] = None,
mask_image: Union[np.ndarray, PIL.Image.Image] = None,
height: int = 512,
width: int = 512,
......@@ -469,11 +469,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
if `guidance_scale` is less than `1`).
init_image (`np.ndarray` or `PIL.Image.Image`):
image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
mask_image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
......@@ -491,11 +491,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
1`. Higher guidance scale encourages the model to generate images that are closely linked to the text
`prompt`, usually at the expense of lower image quality.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
......@@ -533,6 +533,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
if isinstance(prompt, str):
batch_size = 1
......@@ -598,7 +601,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
mask = None
noise = None
if init_image is None:
if image is None:
latents_shape = (
batch_size * num_images_per_prompt,
4,
......@@ -616,11 +619,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
else:
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess_image(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess_image(image)
# encode the init image into latents and scale the latents
init_image = init_image.astype(latents_dtype)
init_latents = self.vae_encoder(sample=init_image)[0]
image = image.astype(latents_dtype)
init_latents = self.vae_encoder(sample=image)[0]
init_latents = 0.18215 * init_latents
init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
init_latents_orig = init_latents
......@@ -635,7 +638,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
# check sizes
if not mask.shape == init_latents.shape:
print(mask.shape, init_latents.shape)
raise ValueError("The mask and init_image should be the same size!")
raise ValueError("The mask and image should be the same size!")
# get the original timestep using init_timestep
offset = self.scheduler.config.get("steps_offset", 0)
......@@ -828,7 +831,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
def img2img(
self,
init_image: Union[np.ndarray, PIL.Image.Image],
image: Union[np.ndarray, PIL.Image.Image],
prompt: Union[str, List[str]],
negative_prompt: Optional[Union[str, List[str]]] = None,
strength: float = 0.8,
......@@ -847,7 +850,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
r"""
Function for image-to-image generation.
Args:
init_image (`np.ndarray` or `PIL.Image.Image`):
image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or ndarray representing an image batch, that will be used as the starting point for the
process.
prompt (`str` or `List[str]`):
......@@ -856,11 +859,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
if `guidance_scale` is less than `1`).
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
......@@ -901,7 +904,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
return self.__call__(
prompt=prompt,
negative_prompt=negative_prompt,
init_image=init_image,
image=image,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
strength=strength,
......@@ -918,7 +921,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
def inpaint(
self,
init_image: Union[np.ndarray, PIL.Image.Image],
image: Union[np.ndarray, PIL.Image.Image],
mask_image: Union[np.ndarray, PIL.Image.Image],
prompt: Union[str, List[str]],
negative_prompt: Optional[Union[str, List[str]]] = None,
......@@ -938,11 +941,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
r"""
Function for inpainting.
Args:
init_image (`np.ndarray` or `PIL.Image.Image`):
image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process. This is the image whose masked region will be inpainted.
mask_image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
......@@ -954,7 +957,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
is 1, the denoising process will be run on the masked area for the full number of iterations specified
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
num_inference_steps (`int`, *optional*, defaults to 50):
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
......@@ -996,7 +999,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
return self.__call__(
prompt=prompt,
negative_prompt=negative_prompt,
init_image=init_image,
image=image,
mask_image=mask_image,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
......
......@@ -121,7 +121,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
def inpaint(
self,
prompt: Union[str, List[str]],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
......@@ -138,7 +138,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
# For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
return StableDiffusionInpaintPipelineLegacy(**self.components)(
prompt=prompt,
init_image=init_image,
image=image,
mask_image=mask_image,
strength=strength,
num_inference_steps=num_inference_steps,
......@@ -156,7 +156,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
def img2img(
self,
prompt: Union[str, List[str]],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
guidance_scale: Optional[float] = 7.5,
......@@ -173,7 +173,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
# For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
return StableDiffusionImg2ImgPipeline(**self.components)(
prompt=prompt,
init_image=init_image,
image=image,
strength=strength,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
......
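
Because `StableDiffusionMegaPipeline` simply forwards to the regular pipelines built from its shared `self.components`, callers only see the renamed argument. A usage sketch (the `custom_pipeline` name `stable_diffusion_mega` and the file path are assumptions):

```python
from PIL import Image
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", custom_pipeline="stable_diffusion_mega"
)

init_image = Image.open("sketch.png").convert("RGB")  # placeholder input
images = pipe.img2img(prompt="A fantasy landscape", image=init_image, strength=0.75).images
```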
......@@ -126,7 +126,7 @@ init_image = init_image.resize((768, 512))
prompt = "A fantasy landscape, trending on artstation"
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
images[0].save("fantasy_landscape.png")
```
......
......@@ -435,9 +435,9 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
return timesteps, num_inference_steps - t_start
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
init_image = init_image.to(device=device, dtype=dtype)
init_latent_dist = self.vae.encode(init_image).latent_dist
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
image = image.to(device=device, dtype=dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
init_latents = 0.18215 * init_latents
......@@ -445,16 +445,16 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
" your script to pass as many init images as text prompts to suppress this warning."
" your script to pass as many initial images as text prompts to suppress this warning."
)
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
additional_image_per_prompt = batch_size // init_latents.shape[0]
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
else:
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
......@@ -472,7 +472,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
guidance_scale: Optional[float] = 7.5,
......@@ -484,6 +484,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
......@@ -491,15 +492,15 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
......@@ -540,6 +541,10 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
# 1. Check inputs
self.check_inputs(prompt, strength, callback_steps)
......@@ -557,8 +562,8 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
)
# 4. Preprocess image
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
......@@ -567,7 +572,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
# 6. Prepare latent variables
latents = self.prepare_latents(
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
)
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
......
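
The rewrapped `strength` docstring above is easier to reason about with a worked example. A sketch of how `strength` maps to the number of denoising steps actually run, mirroring the `get_timesteps` logic in these img2img pipelines (assuming the scheduler's `steps_offset` is 0):

```python
def effective_steps(num_inference_steps: int, strength: float, offset: int = 0) -> int:
    # Mirrors get_timesteps: clip the starting timestep, then count what remains.
    init_timestep = min(int(num_inference_steps * strength) + offset, num_inference_steps)
    t_start = max(num_inference_steps - init_timestep + offset, 0)
    return num_inference_steps - t_start

print(effective_steps(50, 0.75))  # 37 -> about 75% of the schedule is denoised
print(effective_steps(50, 1.0))   # 50 -> full schedule; the input image is essentially ignored
```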
......@@ -17,7 +17,7 @@ from ...schedulers import (
LMSDiscreteScheduler,
PNDMScheduler,
)
from ...utils import PIL_INTERPOLATION
from ...utils import PIL_INTERPOLATION, deprecate
def preprocess(image):
......@@ -66,7 +66,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
@torch.no_grad()
def __call__(
self,
init_image: Union[torch.Tensor, PIL.Image.Image],
image: Union[torch.Tensor, PIL.Image.Image],
batch_size: Optional[int] = 1,
num_inference_steps: Optional[int] = 100,
eta: Optional[float] = 0.0,
......@@ -77,7 +77,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
) -> Union[Tuple, ImagePipelineOutput]:
r"""
Args:
init_image (`torch.Tensor` or `PIL.Image.Image`):
image (`torch.Tensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
batch_size (`int`, *optional*, defaults to 1):
......@@ -102,20 +102,21 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
`return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
generated images.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
if isinstance(init_image, PIL.Image.Image):
if isinstance(image, PIL.Image.Image):
batch_size = 1
elif isinstance(init_image, torch.Tensor):
batch_size = init_image.shape[0]
elif isinstance(image, torch.Tensor):
batch_size = image.shape[0]
else:
raise ValueError(
f"`init_image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(init_image)}"
)
raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}")
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
height, width = init_image.shape[-2:]
height, width = image.shape[-2:]
# in_channels should be 6: 3 for latents, 3 for low resolution image
latents_shape = (batch_size, self.unet.in_channels // 2, height, width)
......@@ -128,7 +129,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
else:
latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
init_image = init_image.to(device=self.device, dtype=latents_dtype)
image = image.to(device=self.device, dtype=latents_dtype)
# set timesteps and move to the correct device
self.scheduler.set_timesteps(num_inference_steps, device=self.device)
......@@ -148,7 +149,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
for t in self.progress_bar(timesteps_tensor):
# concat latents and low resolution image in the channel dimension.
latents_input = torch.cat([latents, init_image], dim=1)
latents_input = torch.cat([latents, image], dim=1)
latents_input = self.scheduler.scale_model_input(latents_input, t)
# predict the noise residual
noise_pred = self.unet(latents_input, t).sample
......
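
A short usage sketch of the renamed argument for the super-resolution pipeline (the checkpoint id `CompVis/ldm-super-resolution-4x-openimages` and the file path are placeholders chosen for illustration):

```python
from PIL import Image
from diffusers import LDMSuperResolutionPipeline

pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")

low_res = Image.open("low_res.png").convert("RGB")  # any small RGB image

# `image=` replaces `init_image=`; the old name still works with a warning until 0.12.0.
upscaled = pipe(image=low_res, num_inference_steps=100, eta=1.0).images[0]
upscaled.save("upscaled.png")
```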
......@@ -138,7 +138,7 @@ prompt = "An astronaut riding an elephant"
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
init_image=init_image,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.8,
......@@ -164,7 +164,7 @@ torch.manual_seed(0)
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
init_image=init_image,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.85,
......
......@@ -477,9 +477,9 @@ class CycleDiffusionPipeline(DiffusionPipeline):
return timesteps, num_inference_steps - t_start
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
init_image = init_image.to(device=device, dtype=dtype)
init_latent_dist = self.vae.encode(init_image).latent_dist
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
image = image.to(device=device, dtype=dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
init_latents = 0.18215 * init_latents
......@@ -487,16 +487,16 @@ class CycleDiffusionPipeline(DiffusionPipeline):
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
" your script to pass as many init images as text prompts to suppress this warning."
" your script to pass as many initial images as text prompts to suppress this warning."
)
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
additional_image_per_prompt = batch_size // init_latents.shape[0]
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
else:
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
......@@ -516,7 +516,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
self,
prompt: Union[str, List[str]],
source_prompt: Union[str, List[str]],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
guidance_scale: Optional[float] = 7.5,
......@@ -528,6 +528,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
......@@ -535,15 +536,15 @@ class CycleDiffusionPipeline(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
......@@ -584,6 +585,10 @@ class CycleDiffusionPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
# 1. Check inputs
self.check_inputs(prompt, strength, callback_steps)
......@@ -602,8 +607,8 @@ class CycleDiffusionPipeline(DiffusionPipeline):
)
# 4. Preprocess image
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
# 5. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
......@@ -612,7 +617,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
# 6. Prepare latent variables
latents, clean_latents = self.prepare_latents(
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
)
source_latents = latents
......
......@@ -229,7 +229,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
init_image: Union[np.ndarray, PIL.Image.Image],
image: Union[np.ndarray, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
guidance_scale: Optional[float] = 7.5,
......@@ -241,6 +241,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
......@@ -248,15 +249,15 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
init_image (`np.ndarray` or `PIL.Image.Image`):
image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
......@@ -296,6 +297,10 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
if isinstance(prompt, str):
batch_size = 1
elif isinstance(prompt, list):
......@@ -320,8 +325,8 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
# set timesteps
self.scheduler.set_timesteps(num_inference_steps)
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
# here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
......@@ -333,9 +338,9 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
)
latents_dtype = text_embeddings.dtype
init_image = init_image.astype(latents_dtype)
image = image.astype(latents_dtype)
# encode the init image into latents and scale the latents
init_latents = self.vae_encoder(sample=init_image)[0]
init_latents = self.vae_encoder(sample=image)[0]
init_latents = 0.18215 * init_latents
if isinstance(prompt, str):
......@@ -344,16 +349,16 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
" your script to pass as many init images as text prompts to suppress this warning."
" your script to pass as many initial images as text prompts to suppress this warning."
)
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
additional_image_per_prompt = len(prompt) // init_latents.shape[0]
init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0)
elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
)
else:
init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
......
......@@ -228,7 +228,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
init_image: Union[np.ndarray, PIL.Image.Image],
image: Union[np.ndarray, PIL.Image.Image],
mask_image: Union[np.ndarray, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
......@@ -241,6 +241,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
......@@ -248,20 +249,20 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
init_image (`nd.ndarray` or `PIL.Image.Image`):
image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process. This is the image whose masked region will be inpainted.
mask_image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
......@@ -301,6 +302,10 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
if isinstance(prompt, str):
batch_size = 1
elif isinstance(prompt, list):
......@@ -325,8 +330,8 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
# set timesteps
self.scheduler.set_timesteps(num_inference_steps)
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
# here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
......@@ -338,10 +343,10 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
)
latents_dtype = text_embeddings.dtype
init_image = init_image.astype(latents_dtype)
image = image.astype(latents_dtype)
# encode the init image into latents and scale the latents
init_latents = self.vae_encoder(sample=init_image)[0]
init_latents = self.vae_encoder(sample=image)[0]
init_latents = 0.18215 * init_latents
# Expand init_latents for batch_size and num_images_per_prompt
......@@ -356,7 +361,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
# check sizes
if not mask.shape == init_latents.shape:
raise ValueError("The mask and init_image should be the same size!")
raise ValueError("The mask and image should be the same size!")
# get the original timestep using init_timestep
offset = self.scheduler.config.get("steps_offset", 0)
......
......@@ -444,9 +444,9 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
return timesteps, num_inference_steps - t_start
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
init_image = init_image.to(device=device, dtype=dtype)
init_latent_dist = self.vae.encode(init_image).latent_dist
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
image = image.to(device=device, dtype=dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
init_latents = 0.18215 * init_latents
......@@ -454,16 +454,16 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
" your script to pass as many init images as text prompts to suppress this warning."
" your script to pass as many initial images as text prompts to suppress this warning."
)
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
additional_image_per_prompt = batch_size // init_latents.shape[0]
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
else:
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
......@@ -481,7 +481,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
guidance_scale: Optional[float] = 7.5,
......@@ -493,6 +493,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
......@@ -500,15 +501,15 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
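
As a worked example of how `strength` modulates the step count, assuming the min/max clamping arithmetic used by this pipeline's `get_timesteps` (not shown in this hunk), `strength=0.75` with `num_inference_steps=50` skips the first 13 scheduler steps and actually runs 37:

```
# Worked example of the `strength` / `num_inference_steps` interaction;
# the clamping below is an assumption mirroring `get_timesteps`.
num_inference_steps = 50
strength = 0.75

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)

print(num_inference_steps - t_start)  # 37 denoising steps actually run
```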
......@@ -549,6 +550,10 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
# 1. Check inputs
self.check_inputs(prompt, strength, callback_steps)
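
The three lines before the input check implement the backward-compatibility shim: `deprecate` pops `init_image` out of `**kwargs`, emits a deprecation warning, and the pipeline falls back to it when `image` was not given. From the caller's side, both spellings keep working during the deprecation window (a sketch, assuming a loaded img2img pipeline `pipe` and an existing `prompt` and `init_image`):

```
# Equivalent calls during the deprecation window; the first warns and will
# stop working once `init_image` is removed in 0.12.0.
images = pipe(prompt=prompt, init_image=init_image, strength=0.75).images  # deprecated
images = pipe(prompt=prompt, image=init_image, strength=0.75).images       # preferred
```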
......@@ -566,8 +571,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
)
# 4. Preprocess image
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
......@@ -576,7 +581,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
# 6. Prepare latent variables
latents = self.prepare_latents(
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
)
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
......
......@@ -459,9 +459,9 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
return timesteps, num_inference_steps - t_start
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
init_image = init_image.to(device=self.device, dtype=dtype)
init_latent_dist = self.vae.encode(init_image).latent_dist
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
image = image.to(device=self.device, dtype=dtype)
init_latent_dist = self.vae.encode(image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
init_latents = 0.18215 * init_latents
......@@ -479,7 +479,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
image: Union[torch.FloatTensor, PIL.Image.Image],
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
......@@ -492,6 +492,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
......@@ -499,19 +500,19 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process. This is the image whose masked region will be inpainted.
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
is 1, the denoising process will be run on the masked area for the full number of iterations specified
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to
that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
num_inference_steps (`int`, *optional*, defaults to 50):
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
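
Since the docstring above requires a single-channel (luminance) mask, here is a minimal sketch of preparing one from an arbitrary RGB mask file (`mask.png` is a hypothetical path):

```
from PIL import Image

# Hypothetical mask preparation: convert to luminance so white pixels mark
# the region to repaint and black pixels the region to preserve.
mask_image = Image.open("mask.png").convert("L")
```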
......@@ -552,6 +553,10 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
# 1. Check inputs
self.check_inputs(prompt, strength, callback_steps)
......@@ -569,8 +574,8 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
)
# 4. Preprocess image and mask
if not isinstance(init_image, torch.FloatTensor):
init_image = preprocess_image(init_image)
if not isinstance(image, torch.FloatTensor):
image = preprocess_image(image)
if not isinstance(mask_image, torch.FloatTensor):
mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
......@@ -583,7 +588,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
# 6. Prepare latent variables
# encode the init image into latents and scale the latents
latents, init_latents_orig, noise = self.prepare_latents(
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
)
# 7. Prepare mask latent
......
......@@ -141,7 +141,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
init_image=init_image,
image=init_image,
)
image = output.images
......@@ -153,7 +153,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
init_image=init_image,
image=init_image,
return_dict=False,
)[0]
......@@ -204,7 +204,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
generator=generator,
num_inference_steps=2,
output_type="np",
init_image=init_image,
image=init_image,
).images
assert image.shape == (1, 32, 32, 3)
......@@ -243,7 +243,7 @@ class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device=torch_device).manual_seed(0)
output = pipe(
prompt=prompt,
init_image=init_image,
image=init_image,
strength=0.75,
guidance_scale=7.5,
generator=generator,
......