"vscode:/vscode.git/clone" did not exist on "aa57bd891cc856e249316aea22f0e55f4d146de9"
Unverified commit 52eb0348, authored by fboulnois and committed by GitHub

Standardize on using `image` argument in all pipelines (#1361)

* feat: switch core pipelines to use image arg

* test: update tests for core pipelines

* feat: switch examples to use image arg

* docs: update docs to use image arg

* style: format code using black and doc-builder

* fix: deprecate use of init_image in all pipelines
parent 2bbf8b67
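
Every hunk below follows the same two-part pattern: callers switch from the `init_image` keyword to `image`, and each pipeline keeps accepting `init_image` through a small deprecation shim until diffusers 0.12.0. A condensed sketch (the shim lines are copied from the hunks below; the surrounding `__call__` skeleton is illustrative only):

```python
# Caller side: the old keyword still works but now warns.
# images = pipe(prompt=prompt, init_image=init_image).images  # deprecated
images = pipe(prompt=prompt, image=init_image).images  # new standard argument

# Pipeline side: illustrative skeleton around the shim added in this commit.
def __call__(self, prompt, image=None, **kwargs):
    message = "Please use `image` instead of `init_image`."
    # `deprecate` pops `init_image` from `kwargs`, warns that it will be
    # removed in diffusers 0.12.0, and returns the popped value (or None).
    init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
    image = init_image or image
    ...
```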
@@ -280,7 +280,7 @@ init_image = init_image.resize((768, 512))
 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 images[0].save("fantasy_landscape.png")
 ```
...
@@ -57,7 +57,7 @@ prompt = "An astronaut riding an elephant"
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    init_image=init_image,
+    image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.8,
@@ -83,7 +83,7 @@ torch.manual_seed(0)
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    init_image=init_image,
+    image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.85,
...
@@ -149,7 +149,7 @@ init_image = init_image.resize((768, 512))
 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 images[0].save("fantasy_landscape.png")
 ```
...
@@ -177,7 +177,7 @@ init_image = download_image(
 prompt = "A fantasy landscape, trending on artstation"
-images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images

 ### Inpainting
@@ -187,7 +187,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
 ```
 As shown above, this single pipeline can run "text-to-image", "image-to-image", and "inpainting".
...
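
For completeness, the text-to-image path of the same pipeline is unaffected by this rename; a sketch assuming the pipeline's `text2img` helper, which is not part of this hunk:

```python
# Assumption: `text2img` is the pipeline's text-to-image helper; it never
# took `init_image`, so it is untouched by this commit.
images = pipe.text2img(prompt="a photograph of an astronaut riding a horse").images
```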
@@ -37,7 +37,7 @@ init_image.thumbnail((768, 768))
 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 images[0].save("fantasy_landscape.png")
 ```
...
@@ -166,7 +166,7 @@ init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-di
 prompt = "A fantasy landscape, trending on artstation"
-images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images

 ### Inpainting
@@ -176,7 +176,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
 ```
 As shown above, this single pipeline can run "text-to-image", "image-to-image", and "inpainting".
@@ -420,7 +420,7 @@ init_image = Image.open(BytesIO(response.content)).convert("RGB")
 init_image = init_image.resize((512, 512))
 res = pipe.train(
     prompt,
-    init_image,
+    image=init_image,
     guidance_scale=7.5,
     num_inference_steps=50,
     generator=generator)
...
@@ -17,7 +17,7 @@ from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import logging
+from diffusers.utils import deprecate, logging

 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -133,7 +133,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
     def train(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         generator: Optional[torch.Generator] = None,
@@ -184,6 +184,10 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
         accelerator = Accelerator(
             gradient_accumulation_steps=1,
             mixed_precision="fp16",
@@ -241,14 +245,14 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
             lr=embedding_learning_rate,
         )

-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)

         latents_dtype = text_embeddings.dtype
-        init_image = init_image.to(device=self.device, dtype=latents_dtype)
-        init_latent_image_dist = self.vae.encode(init_image).latent_dist
-        init_image_latents = init_latent_image_dist.sample(generator=generator)
-        init_image_latents = 0.18215 * init_image_latents
+        image = image.to(device=self.device, dtype=latents_dtype)
+        init_latent_image_dist = self.vae.encode(image).latent_dist
+        image_latents = init_latent_image_dist.sample(generator=generator)
+        image_latents = 0.18215 * image_latents

         progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
         progress_bar.set_description("Steps")
@@ -259,12 +263,12 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
         for _ in range(text_embedding_optimization_steps):
             with accelerator.accumulate(text_embeddings):
                 # Sample noise that we'll add to the latents
-                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
+                noise = torch.randn(image_latents.shape).to(image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=image_latents.device)

                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
+                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)

                 # Predict the noise residual
                 noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
@@ -301,12 +305,12 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
         for _ in range(model_fine_tuning_optimization_steps):
             with accelerator.accumulate(self.unet.parameters()):
                 # Sample noise that we'll add to the latents
-                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
+                noise = torch.randn(image_latents.shape).to(image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=image_latents.device)

                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
+                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)

                 # Predict the noise residual
                 noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
...
@@ -555,7 +555,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         height: int = 512,
         width: int = 512,
@@ -583,11 +583,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
-           init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+           image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-               `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+               `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -605,11 +605,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            strength (`float`, *optional*, defaults to 0.8):
-               Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-               `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+               Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+               `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-               `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+               `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -648,6 +648,9 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image

        if isinstance(prompt, str):
            batch_size = 1
@@ -714,7 +717,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        mask = None
        noise = None

-       if init_image is None:
+       if image is None:
            # get the initial random noise unless the user supplied it
            # Unlike in other pipelines, latents need to be generated in the target device
@@ -753,11 +756,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            # scale the initial noise by the standard deviation required by the scheduler
            latents = latents * self.scheduler.init_noise_sigma
        else:
-           if isinstance(init_image, PIL.Image.Image):
-               init_image = preprocess_image(init_image)
+           if isinstance(image, PIL.Image.Image):
+               image = preprocess_image(image)
            # encode the init image into latents and scale the latents
-           init_image = init_image.to(device=self.device, dtype=latents_dtype)
-           init_latent_dist = self.vae.encode(init_image).latent_dist
+           image = image.to(device=self.device, dtype=latents_dtype)
+           init_latent_dist = self.vae.encode(image).latent_dist
            init_latents = init_latent_dist.sample(generator=generator)
            init_latents = 0.18215 * init_latents
            init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
@@ -772,7 +775,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            # check sizes
            if not mask.shape == init_latents.shape:
-               raise ValueError("The mask and init_image should be the same size!")
+               raise ValueError("The mask and image should be the same size!")

            # get the original timestep using init_timestep
            offset = self.scheduler.config.get("steps_offset", 0)
@@ -961,7 +964,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
    def img2img(
        self,
-       init_image: Union[torch.FloatTensor, PIL.Image.Image],
+       image: Union[torch.FloatTensor, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        strength: float = 0.8,
@@ -980,7 +983,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        r"""
        Function for image-to-image generation.
        Args:
-           init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+           image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            prompt (`str` or `List[str]`):
@@ -989,11 +992,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
-               Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-               `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+               Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+               `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-               `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+               `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.
@@ -1035,7 +1038,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-           init_image=init_image,
+           image=image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
@@ -1052,7 +1055,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
    def inpaint(
        self,
-       init_image: Union[torch.FloatTensor, PIL.Image.Image],
+       image: Union[torch.FloatTensor, PIL.Image.Image],
        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -1072,11 +1075,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        r"""
        Function for inpaint.
        Args:
-           init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+           image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. This is the image whose masked region will be inpainted.
            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-               `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+               `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1088,7 +1091,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                is 1, the denoising process will be run on the masked area for the full number of iterations specified
-               in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+               in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1131,7 +1134,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-           init_image=init_image,
+           image=image,
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
...
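
Taken together, the hunks above make one `__call__` serve all three tasks, dispatching on whether `image` and `mask_image` are provided; a condensed illustration, assuming `pipe`, `init_image`, and `mask` are already prepared:

```python
out = pipe(prompt)                                     # text-to-image: `image` is None
out = pipe(prompt, image=init_image, strength=0.75)    # image-to-image
out = pipe(prompt, image=init_image, mask_image=mask)  # inpainting
```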
@@ -10,7 +10,7 @@ from diffusers.onnx_utils import OnnxRuntimeModel
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import logging
+from diffusers.utils import deprecate, logging

 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -441,7 +441,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
-       init_image: Union[np.ndarray, PIL.Image.Image] = None,
+       image: Union[np.ndarray, PIL.Image.Image] = None,
        mask_image: Union[np.ndarray, PIL.Image.Image] = None,
        height: int = 512,
        width: int = 512,
@@ -469,11 +469,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
-           init_image (`np.ndarray` or `PIL.Image.Image`):
+           image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            mask_image (`np.ndarray` or `PIL.Image.Image`):
-               `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+               `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -491,11 +491,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            strength (`float`, *optional*, defaults to 0.8):
-               Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-               `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+               Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+               `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-               `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+               `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
@@ -533,6 +533,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image

        if isinstance(prompt, str):
            batch_size = 1
@@ -598,7 +601,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        mask = None
        noise = None

-       if init_image is None:
+       if image is None:
            latents_shape = (
                batch_size * num_images_per_prompt,
                4,
@@ -616,11 +619,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            # scale the initial noise by the standard deviation required by the scheduler
            latents = latents * self.scheduler.init_noise_sigma
        else:
-           if isinstance(init_image, PIL.Image.Image):
-               init_image = preprocess_image(init_image)
+           if isinstance(image, PIL.Image.Image):
+               image = preprocess_image(image)
            # encode the init image into latents and scale the latents
-           init_image = init_image.astype(latents_dtype)
-           init_latents = self.vae_encoder(sample=init_image)[0]
+           image = image.astype(latents_dtype)
+           init_latents = self.vae_encoder(sample=image)[0]
            init_latents = 0.18215 * init_latents
            init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
            init_latents_orig = init_latents
@@ -635,7 +638,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            # check sizes
            if not mask.shape == init_latents.shape:
                print(mask.shape, init_latents.shape)
-               raise ValueError("The mask and init_image should be the same size!")
+               raise ValueError("The mask and image should be the same size!")

            # get the original timestep using init_timestep
            offset = self.scheduler.config.get("steps_offset", 0)
@@ -828,7 +831,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
    def img2img(
        self,
-       init_image: Union[np.ndarray, PIL.Image.Image],
+       image: Union[np.ndarray, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        strength: float = 0.8,
@@ -847,7 +850,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        r"""
        Function for image-to-image generation.
        Args:
-           init_image (`np.ndarray` or `PIL.Image.Image`):
+           image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or ndarray representing an image batch, that will be used as the starting point for the
                process.
            prompt (`str` or `List[str]`):
@@ -856,11 +859,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
-               Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-               `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+               Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+               `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-               `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+               `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.
@@ -901,7 +904,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-           init_image=init_image,
+           image=image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
@@ -918,7 +921,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
    def inpaint(
        self,
-       init_image: Union[np.ndarray, PIL.Image.Image],
+       image: Union[np.ndarray, PIL.Image.Image],
        mask_image: Union[np.ndarray, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -938,11 +941,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        r"""
        Function for inpaint.
        Args:
-           init_image (`np.ndarray` or `PIL.Image.Image`):
+           image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. This is the image whose masked region will be inpainted.
            mask_image (`np.ndarray` or `PIL.Image.Image`):
-               `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+               `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -954,7 +957,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                is 1, the denoising process will be run on the masked area for the full number of iterations specified
-               in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+               in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -996,7 +999,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-           init_image=init_image,
+           image=image,
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
...
@@ -121,7 +121,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
    def inpaint(
        self,
        prompt: Union[str, List[str]],
-       init_image: Union[torch.FloatTensor, PIL.Image.Image],
+       image: Union[torch.FloatTensor, PIL.Image.Image],
        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
@@ -138,7 +138,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
        # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
        return StableDiffusionInpaintPipelineLegacy(**self.components)(
            prompt=prompt,
-           init_image=init_image,
+           image=image,
            mask_image=mask_image,
            strength=strength,
            num_inference_steps=num_inference_steps,
@@ -156,7 +156,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
    def img2img(
        self,
        prompt: Union[str, List[str]],
-       init_image: Union[torch.FloatTensor, PIL.Image.Image],
+       image: Union[torch.FloatTensor, PIL.Image.Image],
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
@@ -173,7 +173,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
        # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
        return StableDiffusionImg2ImgPipeline(**self.components)(
            prompt=prompt,
-           init_image=init_image,
+           image=image,
            strength=strength,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
...
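
A usage sketch for the mega pipeline these methods belong to; the `custom_pipeline` id and checkpoint are assumptions here, not part of the diff:

```python
import PIL.Image
import torch
from diffusers import DiffusionPipeline

# Assumed ids: the community "stable_diffusion_mega" pipeline on an SD 1.x checkpoint.
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="stable_diffusion_mega",
    torch_dtype=torch.float16,
).to("cuda")

init_image = PIL.Image.open("sketch.png").convert("RGB")
# After this commit, both helpers take `image` rather than `init_image`.
out = pipe.img2img(prompt="A fantasy landscape", image=init_image, strength=0.75).images
```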
@@ -126,7 +126,7 @@ init_image = init_image.resize((768, 512))
 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 images[0].save("fantasy_landscape.png")
 ```
...
@@ -435,9 +435,9 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):

        return timesteps, num_inference_steps - t_start

-   def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
-       init_image = init_image.to(device=device, dtype=dtype)
-       init_latent_dist = self.vae.encode(init_image).latent_dist
+   def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+       image = image.to(device=device, dtype=dtype)
+       init_latent_dist = self.vae.encode(image).latent_dist
        init_latents = init_latent_dist.sample(generator=generator)
        init_latents = 0.18215 * init_latents
@@ -445,16 +445,16 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
            # expand init_latents for batch_size
            deprecation_message = (
                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
-               " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+               " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
-               " your script to pass as many init images as text prompts to suppress this warning."
+               " your script to pass as many initial images as text prompts to suppress this warning."
            )
-           deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+           deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
            additional_image_per_prompt = batch_size // init_latents.shape[0]
            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
            raise ValueError(
-               f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+               f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        else:
            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
@@ -472,7 +472,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
    def __call__(
        self,
        prompt: Union[str, List[str]],
-       init_image: Union[torch.FloatTensor, PIL.Image.Image],
+       image: Union[torch.FloatTensor, PIL.Image.Image],
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
@@ -484,6 +484,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
+       **kwargs,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -491,15 +492,15 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
-           init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+           image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            strength (`float`, *optional*, defaults to 0.8):
-               Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-               `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
-               number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
-               noise will be maximum and the denoising process will run for the full number of iterations specified in
-               `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+               Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+               will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+               denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+               be maximum and the denoising process will run for the full number of iterations specified in
+               `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.
@@ -540,6 +541,10 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
        # 1. Check inputs
        self.check_inputs(prompt, strength, callback_steps)
@@ -557,8 +562,8 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        )

        # 4. Preprocess image
-       if isinstance(init_image, PIL.Image.Image):
-           init_image = preprocess(init_image)
+       if isinstance(image, PIL.Image.Image):
+           image = preprocess(image)

        # 5. set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -567,7 +572,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):

        # 6. Prepare latent variables
        latents = self.prepare_latents(
-           init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
+           image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
        )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
...
@@ -17,7 +17,7 @@ from ...schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import PIL_INTERPOLATION
+from ...utils import PIL_INTERPOLATION, deprecate


 def preprocess(image):
@@ -66,7 +66,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        init_image: Union[torch.Tensor, PIL.Image.Image],
+        image: Union[torch.Tensor, PIL.Image.Image],
         batch_size: Optional[int] = 1,
         num_inference_steps: Optional[int] = 100,
         eta: Optional[float] = 0.0,
@@ -77,7 +77,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
     ) -> Union[Tuple, ImagePipelineOutput]:
         r"""
         Args:
-            init_image (`torch.Tensor` or `PIL.Image.Image`):
+            image (`torch.Tensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             batch_size (`int`, *optional*, defaults to 1):
@@ -102,20 +102,21 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
             `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
             generated images.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image

-        if isinstance(init_image, PIL.Image.Image):
+        if isinstance(image, PIL.Image.Image):
             batch_size = 1
-        elif isinstance(init_image, torch.Tensor):
-            batch_size = init_image.shape[0]
+        elif isinstance(image, torch.Tensor):
+            batch_size = image.shape[0]
         else:
-            raise ValueError(
-                f"`init_image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(init_image)}"
-            )
+            raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}")

-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)

-        height, width = init_image.shape[-2:]
+        height, width = image.shape[-2:]

         # in_channels should be 6: 3 for latents, 3 for low resolution image
         latents_shape = (batch_size, self.unet.in_channels // 2, height, width)
@@ -128,7 +129,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
         else:
             latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)

-        init_image = init_image.to(device=self.device, dtype=latents_dtype)
+        image = image.to(device=self.device, dtype=latents_dtype)

         # set timesteps and move to the correct device
         self.scheduler.set_timesteps(num_inference_steps, device=self.device)
@@ -148,7 +149,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
         for t in self.progress_bar(timesteps_tensor):
             # concat latents and low resolution image in the channel dimension.
-            latents_input = torch.cat([latents, init_image], dim=1)
+            latents_input = torch.cat([latents, image], dim=1)
             latents_input = self.scheduler.scale_model_input(latents_input, t)

             # predict the noise residual
             noise_pred = self.unet(latents_input, t).sample
...
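For context, a short usage sketch of the renamed argument on this pipeline; the checkpoint id and file paths here are illustrative assumptions, not taken from the PR.

```python
import torch
from diffusers import LDMSuperResolutionPipeline
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages").to(device)

low_res = Image.open("low_res.png").convert("RGB").resize((128, 128))
# `image` replaces the old `init_image` keyword; `init_image=` still works until 0.12.0, with a FutureWarning
upscaled = pipe(image=low_res, num_inference_steps=100, eta=1.0).images[0]
upscaled.save("upscaled.png")
```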
@@ -138,7 +138,7 @@ prompt = "An astronaut riding an elephant"
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    init_image=init_image,
+    image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.8,
@@ -164,7 +164,7 @@ torch.manual_seed(0)
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    init_image=init_image,
+    image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.85,
...
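Existing CycleDiffusion scripts that still pass `init_image=` keep working until 0.12.0 but emit a FutureWarning. A quick, hypothetical way to verify that, reusing the `pipe`, `prompt`, `source_prompt`, and `init_image` objects from the example above:

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    image = pipe(
        prompt=prompt,
        source_prompt=source_prompt,
        init_image=init_image,  # deprecated spelling, still accepted for now
        num_inference_steps=100,
        eta=0.1,
        strength=0.8,
    ).images[0]

assert any(issubclass(w.category, FutureWarning) for w in caught)
```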
@@ -477,9 +477,9 @@ class CycleDiffusionPipeline(DiffusionPipeline):
         return timesteps, num_inference_steps - t_start

-    def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
-        init_image = init_image.to(device=device, dtype=dtype)
-        init_latent_dist = self.vae.encode(init_image).latent_dist
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        image = image.to(device=device, dtype=dtype)
+        init_latent_dist = self.vae.encode(image).latent_dist
         init_latents = init_latent_dist.sample(generator=generator)
         init_latents = 0.18215 * init_latents
@@ -487,16 +487,16 @@ class CycleDiffusionPipeline(DiffusionPipeline):
             # expand init_latents for batch_size
             deprecation_message = (
                 f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
-                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " images (`image`). Initial images are now being duplicated to match the number of text prompts. Note"
                 " that this behavior is deprecated and will be removed in version 1.0.0. Please make sure to update"
-                " your script to pass as many init images as text prompts to suppress this warning."
+                " your script to pass as many initial images as text prompts to suppress this warning."
             )
-            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
             additional_image_per_prompt = batch_size // init_latents.shape[0]
             init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
         elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
             raise ValueError(
-                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
             )
         else:
             init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
@@ -516,7 +516,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]],
         source_prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
@@ -528,6 +528,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -535,15 +536,15 @@ class CycleDiffusionPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
-                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
-                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -584,6 +585,10 @@ class CycleDiffusionPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
         # 1. Check inputs
         self.check_inputs(prompt, strength, callback_steps)
@@ -602,8 +607,8 @@ class CycleDiffusionPipeline(DiffusionPipeline):
         )

         # 4. Preprocess image
-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)

         # 5. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -612,7 +617,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
         # 6. Prepare latent variables
         latents, clean_latents = self.prepare_latents(
-            init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
         )
         source_latents = latents
...
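The `prepare_latents` hunk above also encodes the prompt/image batch rule that several of these pipelines share. Restated as a standalone function, with illustrative names rather than the pipeline's internals:

```python
import torch


def expand_init_latents(init_latents: torch.Tensor, batch_size: int, num_images_per_prompt: int) -> torch.Tensor:
    if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
        # deprecated path: tile the encoded images to match the number of prompts
        repeat = batch_size // init_latents.shape[0]
        return torch.cat([init_latents] * repeat * num_images_per_prompt, dim=0)
    if batch_size > init_latents.shape[0]:
        # prompt count is not a multiple of the image count: refuse rather than guess
        raise ValueError(
            f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
        )
    return torch.cat([init_latents] * num_images_per_prompt, dim=0)


latents = torch.randn(1, 4, 64, 64)
assert expand_init_latents(latents, batch_size=2, num_images_per_prompt=1).shape[0] == 2
assert expand_init_latents(latents, batch_size=1, num_images_per_prompt=3).shape[0] == 3
```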
@@ -229,7 +229,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[np.ndarray, PIL.Image.Image],
+        image: Union[np.ndarray, PIL.Image.Image],
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
@@ -241,6 +241,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
         callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -248,15 +249,15 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            init_image (`np.ndarray` or `PIL.Image.Image`):
+            image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
-                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
-                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -296,6 +297,10 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
         if isinstance(prompt, str):
             batch_size = 1
         elif isinstance(prompt, list):
@@ -320,8 +325,8 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)

-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)

         # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -333,9 +338,9 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
         )

         latents_dtype = text_embeddings.dtype
-        init_image = init_image.astype(latents_dtype)
+        image = image.astype(latents_dtype)

         # encode the init image into latents and scale the latents
-        init_latents = self.vae_encoder(sample=init_image)[0]
+        init_latents = self.vae_encoder(sample=image)[0]
         init_latents = 0.18215 * init_latents

         if isinstance(prompt, str):
@@ -344,16 +349,16 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
             # expand init_latents for batch_size
             deprecation_message = (
                 f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
-                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " images (`image`). Initial images are now being duplicated to match the number of text prompts. Note"
                 " that this behavior is deprecated and will be removed in version 1.0.0. Please make sure to update"
-                " your script to pass as many init images as text prompts to suppress this warning."
+                " your script to pass as many initial images as text prompts to suppress this warning."
             )
-            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
             additional_image_per_prompt = len(prompt) // init_latents.shape[0]
             init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0)
         elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
             raise ValueError(
-                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
             )
         else:
             init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
...
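A usage sketch for the ONNX img2img pipeline after the rename; the model id, revision, provider, and file names below are assumptions for illustration.

```python
from diffusers import OnnxStableDiffusionImg2ImgPipeline
from PIL import Image

pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", revision="onnx", provider="CPUExecutionProvider"
)

init_image = Image.open("sketch.png").convert("RGB").resize((768, 512))
result = pipe(
    prompt="a watercolor painting of a lighthouse at dusk",
    image=init_image,  # formerly `init_image`
    strength=0.75,
    guidance_scale=7.5,
).images[0]
result.save("lighthouse.png")
```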
@@ -228,7 +228,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[np.ndarray, PIL.Image.Image],
+        image: Union[np.ndarray, PIL.Image.Image],
         mask_image: Union[np.ndarray, PIL.Image.Image],
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
@@ -241,6 +241,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
         callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -248,20 +249,20 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            init_image (`np.ndarray` or `PIL.Image.Image`):
+            image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
             mask_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
-                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
-                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -301,6 +302,10 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
         if isinstance(prompt, str):
             batch_size = 1
         elif isinstance(prompt, list):
@@ -325,8 +330,8 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)

-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)

         # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -338,10 +343,10 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         )

         latents_dtype = text_embeddings.dtype
-        init_image = init_image.astype(latents_dtype)
+        image = image.astype(latents_dtype)

         # encode the init image into latents and scale the latents
-        init_latents = self.vae_encoder(sample=init_image)[0]
+        init_latents = self.vae_encoder(sample=image)[0]
         init_latents = 0.18215 * init_latents

         # Expand init_latents for batch_size and num_images_per_prompt
@@ -356,7 +361,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         # check sizes
         if not mask.shape == init_latents.shape:
-            raise ValueError("The mask and init_image should be the same size!")
+            raise ValueError("The mask and image should be the same size!")

         # get the original timestep using init_timestep
         offset = self.scheduler.config.get("steps_offset", 0)
...
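The mask convention in the docstring above (white repainted, black preserved, single luminance channel) can be checked before calling the pipeline. A small sketch, with a hypothetical `mask.png` standing in for a real mask:

```python
import numpy as np
from PIL import Image

mask = Image.open("mask.png").convert("L").resize((512, 512))  # single channel (luminance)
mask_array = np.array(mask, dtype=np.float32) / 255.0

# share of pixels that will be regenerated (white region of the mask)
repaint_fraction = float(mask_array.round().mean())
print(f"{repaint_fraction:.1%} of the image falls inside the inpainting mask")
```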
@@ -444,9 +444,9 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
         return timesteps, num_inference_steps - t_start

-    def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
-        init_image = init_image.to(device=device, dtype=dtype)
-        init_latent_dist = self.vae.encode(init_image).latent_dist
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        image = image.to(device=device, dtype=dtype)
+        init_latent_dist = self.vae.encode(image).latent_dist
         init_latents = init_latent_dist.sample(generator=generator)
         init_latents = 0.18215 * init_latents
@@ -454,16 +454,16 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
             # expand init_latents for batch_size
             deprecation_message = (
                 f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
-                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " images (`image`). Initial images are now being duplicated to match the number of text prompts. Note"
                 " that this behavior is deprecated and will be removed in version 1.0.0. Please make sure to update"
-                " your script to pass as many init images as text prompts to suppress this warning."
+                " your script to pass as many initial images as text prompts to suppress this warning."
             )
-            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
             additional_image_per_prompt = batch_size // init_latents.shape[0]
             init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
         elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
             raise ValueError(
-                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
             )
         else:
             init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
@@ -481,7 +481,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
@@ -493,6 +493,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -500,15 +501,15 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
-                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
-                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -549,6 +550,10 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
         # 1. Check inputs
         self.check_inputs(prompt, strength, callback_steps)
@@ -566,8 +571,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
         )

         # 4. Preprocess image
-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)

         # 5. set timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -576,7 +581,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
         # 6. Prepare latent variables
         latents = self.prepare_latents(
-            init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
         )

         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
...
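The docstring says `num_inference_steps` "will be modulated by `strength`". Concretely, the schedule is truncated roughly like this; a simplified restatement of the `get_timesteps` logic that ignores the scheduler's `steps_offset`:

```python
def effective_steps(num_inference_steps: int, strength: float) -> int:
    # `strength` controls how much of the schedule is actually run:
    # strength=1.0 denoises for the full schedule, 0.5 for roughly half of it
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return num_inference_steps - t_start


assert effective_steps(50, 1.0) == 50
assert effective_steps(50, 0.75) == 37
```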
@@ -459,9 +459,9 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         return timesteps, num_inference_steps - t_start

-    def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
-        init_image = init_image.to(device=self.device, dtype=dtype)
-        init_latent_dist = self.vae.encode(init_image).latent_dist
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
+        image = image.to(device=self.device, dtype=dtype)
+        init_latent_dist = self.vae.encode(image).latent_dist
         init_latents = init_latent_dist.sample(generator=generator)
         init_latents = 0.18215 * init_latents
@@ -479,7 +479,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         mask_image: Union[torch.FloatTensor, PIL.Image.Image],
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
@@ -492,6 +492,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -499,19 +500,19 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
-                is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
-                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+                is 1, the denoising process will be run on the masked area for the full number of iterations specified
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to
+                that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
                 the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
@@ -552,6 +553,10 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
         # 1. Check inputs
         self.check_inputs(prompt, strength, callback_steps)
@@ -569,8 +574,8 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         )

         # 4. Preprocess image and mask
-        if not isinstance(init_image, torch.FloatTensor):
-            init_image = preprocess_image(init_image)
+        if not isinstance(image, torch.FloatTensor):
+            image = preprocess_image(image)

         if not isinstance(mask_image, torch.FloatTensor):
             mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
@@ -583,7 +588,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         # 6. Prepare latent variables
         # encode the init image into latents and scale the latents
         latents, init_latents_orig, noise = self.prepare_latents(
-            init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
+            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
         )

         # 7. Prepare mask latent
...
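The legacy inpainting pipelines keep `init_latents_orig` and `noise` around so that, after each denoising step, the region outside the mask can be reset to a re-noised copy of the original latents. Schematically, with this sketch's own convention of mask == 1 marking the repainted region (the pipeline normalizes its mask during preprocessing, and its internal convention may differ):

```python
import torch


def masked_update(latents: torch.Tensor, renoised_orig: torch.Tensor, repaint_mask: torch.Tensor) -> torch.Tensor:
    # keep the re-noised original outside the mask and the freshly denoised
    # latents inside it, so only the masked area is actually edited
    return renoised_orig * (1 - repaint_mask) + latents * repaint_mask


latents = torch.zeros(1, 4, 8, 8)
renoised = torch.ones(1, 4, 8, 8)
mask = torch.zeros(1, 1, 8, 8)
mask[..., :4] = 1  # repaint the left half
out = masked_update(latents, renoised, mask)
assert out[..., :4].abs().sum() == 0 and out[..., 4:].mean() == 1
```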
@@ -141,7 +141,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
             guidance_scale=6.0,
             num_inference_steps=2,
             output_type="np",
-            init_image=init_image,
+            image=init_image,
         )
         image = output.images
@@ -153,7 +153,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
             guidance_scale=6.0,
             num_inference_steps=2,
             output_type="np",
-            init_image=init_image,
+            image=init_image,
             return_dict=False,
         )[0]
@@ -204,7 +204,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
             generator=generator,
             num_inference_steps=2,
             output_type="np",
-            init_image=init_image,
+            image=init_image,
         ).images
         assert image.shape == (1, 32, 32, 3)
@@ -243,7 +243,7 @@ class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
         generator = torch.Generator(device=torch_device).manual_seed(0)
         output = pipe(
             prompt=prompt,
-            init_image=init_image,
+            image=init_image,
             strength=0.75,
             guidance_scale=7.5,
             generator=generator,
...
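A natural companion to the updated tests would be one that locks in the deprecation path itself. A hypothetical sketch, with fixture names `pipe` and `init_image` assumed rather than taken from the test file:

```python
import pytest


def test_init_image_keyword_is_deprecated(pipe, init_image):
    # the old keyword must still work until 0.12.0, but must warn
    with pytest.warns(FutureWarning, match="Please use `image` instead of `init_image`"):
        output = pipe(
            prompt="a photo of an astronaut",
            init_image=init_image,
            num_inference_steps=2,
            output_type="np",
        )
    assert output.images.shape[0] == 1
```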