"vscode:/vscode.git/clone" did not exist on "cbbad0af6942f1b46a5c1807edb48adb703ba96e"
Unverified Commit 4a343077 authored by Sayak Paul, committed by GitHub

add: utility to format our docs too 📜 (#7314)

* add: utility to format our docs too 📜

* debugging saga

* fix: message

* checking

* should be fixed.

* revert pipeline_fixture

* remove empty line

* make style

* fix: setup.py

* style.
parent 8e963d1c
@@ -1000,8 +1000,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
 def fuse_qkv_projections(self):
 """
-Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
-key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+are fused. For cross-attention modules, key and value projection matrices are fused.
 <Tip warning={true}>
@@ -1112,8 +1112,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
 Returns:
 [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
-If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
-a `tuple` is returned where the first element is the sample tensor.
+If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
+otherwise a `tuple` is returned where the first element is the sample tensor.
 """
 # By default samples have to be AT least a multiple of the overall upsampling factor.
 # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
......
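For orientation, a minimal sketch of how the fused QKV projections documented in the hunk above are usually enabled; it assumes the same `fuse_qkv_projections()` method is exposed by the standard Stable Diffusion UNet, and the model id and prompt are only illustrative:

```python
import torch
from diffusers import StableDiffusionPipeline

# Load a pipeline (illustrative checkpoint) and fuse the attention projections on its UNet.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.unet.fuse_qkv_projections()  # fuses q/k/v for self-attention, k/v for cross-attention

image = pipe("an astronaut riding a horse on the moon").images[0]
```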
@@ -41,20 +41,20 @@ class FreeInitMixin:
 num_iters (`int`, *optional*, defaults to `3`):
 Number of FreeInit noise re-initialization iterations.
 use_fast_sampling (`bool`, *optional*, defaults to `False`):
-Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
-the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
+Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables the
+"Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
 method (`str`, *optional*, defaults to `butterworth`):
-Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
-FreeInit low pass filter.
+Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the FreeInit low
+pass filter.
 order (`int`, *optional*, defaults to `4`):
 Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
 whereas lower values lead to `gaussian` method behaviour.
 spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
-Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
-the original implementation.
+Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in the
+original implementation.
 temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
-Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
-the original implementation.
+Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in the
+original implementation.
 """
 self._free_init_num_iters = num_iters
 self._free_init_use_fast_sampling = use_fast_sampling
......
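The FreeInit arguments documented in the hunk above map directly onto `FreeInitMixin.enable_free_init`. A minimal sketch on an AnimateDiff pipeline (model ids and prompt are illustrative):

```python
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter

# Illustrative checkpoints; any motion-module pipeline that mixes in FreeInitMixin works the same way.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

pipe.enable_free_init(
    num_iters=3,                   # noise re-initialization iterations
    use_fast_sampling=False,       # True enables "Coarse-to-Fine Sampling"
    method="butterworth",          # or "ideal" / "gaussian"
    order=4,                       # only used by the butterworth filter
    spatial_stop_frequency=0.25,   # d_s in the original implementation
    temporal_stop_frequency=0.25,  # d_t in the original implementation
)
frames = pipe("a panda surfing a wave", num_frames=16, num_inference_steps=25).frames[0]
pipe.disable_free_init()
```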
@@ -43,10 +43,14 @@ EXAMPLE_DOC_STRING = """
 >>> from diffusers import I2VGenXLPipeline
 >>> from diffusers.utils import export_to_gif, load_image
->>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
+>>> pipeline = I2VGenXLPipeline.from_pretrained(
+... "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
+... )
 >>> pipeline.enable_model_cpu_offload()
->>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+>>> image_url = (
+... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+... )
 >>> image = load_image(image_url).convert("RGB")
 >>> prompt = "Papers were floating in the air on a table in the library"
@@ -59,7 +63,7 @@ EXAMPLE_DOC_STRING = """
 ... num_inference_steps=50,
 ... negative_prompt=negative_prompt,
 ... guidance_scale=9.0,
-... generator=generator
+... generator=generator,
 ... ).frames[0]
 >>> video_path = export_to_gif(frames, "i2v.gif")
 ```
@@ -95,7 +99,8 @@ class I2VGenXLPipelineOutput(BaseOutput):
 Args:
 frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
+List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+denoised
 PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
 `(batch_size, num_frames, channels, height, width)`
 """
@@ -551,7 +556,8 @@ class I2VGenXLPipeline(
 width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
 The width in pixels of the generated image.
 target_fps (`int`, *optional*):
-Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation.
+Frames per second. The rate at which the generated images shall be exported to a video after
+generation. This is also used as a "micro-condition" while generation.
 num_frames (`int`, *optional*):
 The number of video frames to generate.
 num_inference_steps (`int`, *optional*):
@@ -568,9 +574,9 @@ class I2VGenXLPipeline(
 num_videos_per_prompt (`int`, *optional*):
 The number of images to generate per prompt.
 decode_chunk_size (`int`, *optional*):
-The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
-between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
-for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
+The number of frames to decode at a time. The higher the chunk size, the higher the temporal
+consistency between frames, but also the higher the memory consumption. By default, the decoder will
+decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
 generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
 generation deterministic.
......
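Continuing the doctest above, a hedged sketch of how `target_fps`, `num_frames`, and `decode_chunk_size` interact in the call; the values are illustrative, and a smaller `decode_chunk_size` trades temporal consistency for lower peak memory:

```python
# `pipeline`, `image`, `prompt`, and `generator` as defined in the example docstring above.
frames = pipeline(
    prompt=prompt,
    image=image,
    target_fps=16,        # export frame rate, also used as a micro-condition
    num_frames=16,
    decode_chunk_size=4,  # decode 4 frames at a time to reduce memory; omit to decode all at once
    generator=generator,
).frames[0]
```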
@@ -35,10 +35,10 @@ DYNAMIC_MAP = {
 def convert_state_dict(unet_state_dict):
 """
-Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
 Args:
-unet_model (torch.nn.Module): The original U-Net model.
-unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with.
+Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
+unet_model (torch.nn.Module): The original U-Net model. unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet
+model to match keys with.
 Returns:
 OrderedDict: The converted state dictionary.
......
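As a rough illustration of the key-renaming pattern a `convert_state_dict` helper like the one above follows (the mapping below is made up for the example; the real script drives it from `DYNAMIC_MAP`):

```python
from collections import OrderedDict

# Hypothetical prefix mapping, standing in for the real DYNAMIC_MAP entries.
EXAMPLE_MAP = {"in_layer": "conv_in", "out_layer": "conv_out"}


def rename_keys(state_dict):
    converted = OrderedDict()
    for key, tensor in state_dict.items():
        new_key = key
        for old_prefix, new_prefix in EXAMPLE_MAP.items():
            if new_key.startswith(old_prefix):
                new_key = new_key.replace(old_prefix, new_prefix, 1)
        converted[new_key] = tensor  # tensors are reused, only the keys change
    return converted
```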
@@ -24,7 +24,9 @@ EXAMPLE_DOC_STRING = """
 >>> from diffusers import AutoPipelineForText2Image
 >>> import torch
->>> pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
+>>> pipe = AutoPipelineForText2Image.from_pretrained(
+... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+... )
 >>> pipe.enable_model_cpu_offload()
 >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
......
@@ -29,11 +29,15 @@ EXAMPLE_DOC_STRING = """
 >>> from diffusers.utils import load_image
 >>> import torch
->>> pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
+>>> pipe = AutoPipelineForImage2Image.from_pretrained(
+... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+... )
 >>> pipe.enable_model_cpu_offload()
 >>> prompt = "A painting of the inside of a subway train with tiny raccoons."
->>> image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png")
+>>> image = load_image(
+... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
+... )
 >>> generator = torch.Generator(device="cpu").manual_seed(0)
 >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
......
@@ -73,8 +73,8 @@ def retrieve_timesteps(
 scheduler (`SchedulerMixin`):
 The scheduler to get timesteps from.
 num_inference_steps (`int`):
-The number of diffusion steps used when generating samples with a pre-trained model. If used,
-`timesteps` must be `None`.
+The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+must be `None`.
 device (`str` or `torch.device`, *optional*):
 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
 timesteps (`List[int]`, *optional*):
@@ -749,10 +749,10 @@ class LatentConsistencyModelImg2ImgPipeline(
 ip_adapter_image: (`PipelineImageInput`, *optional*):
 Optional image input to work with IP Adapters.
 ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-if `do_classifier_free_guidance` is set to `True`.
-If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+provided, embeddings are computed from the `ip_adapter_image` input argument.
 output_type (`str`, *optional*, defaults to `"pil"`):
 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
 return_dict (`bool`, *optional*, defaults to `True`):
......
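A hedged sketch of the `retrieve_timesteps` contract documented above: custom `timesteps` and `num_inference_steps` are mutually exclusive, and a custom schedule requires the scheduler's `set_timesteps` to accept a `timesteps` argument (the function name below is hypothetical, not the module-level helper itself):

```python
import inspect


def retrieve_timesteps_sketch(scheduler, num_inference_steps=None, device=None, timesteps=None):
    if timesteps is not None:
        # Custom schedules are only valid for schedulers that expose a `timesteps` argument.
        if "timesteps" not in inspect.signature(scheduler.set_timesteps).parameters:
            raise ValueError(f"{scheduler.__class__.__name__} does not support custom timesteps.")
        scheduler.set_timesteps(timesteps=timesteps, device=device)
        num_inference_steps = len(scheduler.timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device)
    return scheduler.timesteps, num_inference_steps
```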
@@ -77,8 +77,8 @@ def retrieve_timesteps(
 scheduler (`SchedulerMixin`):
 The scheduler to get timesteps from.
 num_inference_steps (`int`):
-The number of diffusion steps used when generating samples with a pre-trained model. If used,
-`timesteps` must be `None`.
+The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+must be `None`.
 device (`str` or `torch.device`, *optional*):
 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
 timesteps (`List[int]`, *optional*):
@@ -681,10 +681,10 @@ class LatentConsistencyModelPipeline(
 ip_adapter_image: (`PipelineImageInput`, *optional*):
 Optional image input to work with IP Adapters.
 ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-if `do_classifier_free_guidance` is set to `True`.
-If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+provided, embeddings are computed from the `ip_adapter_image` input argument.
 output_type (`str`, *optional*, defaults to `"pil"`):
 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
 return_dict (`bool`, *optional*, defaults to `True`):
......
@@ -40,30 +40,21 @@ EXAMPLE_DOC_STRING = """
 >>> from io import BytesIO
 >>> from diffusers import LEditsPPPipelineStableDiffusion
+>>> from diffusers.utils import load_image
 >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
 ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
 ... )
 >>> pipe = pipe.to("cuda")
->>> def download_image(url):
-... response = requests.get(url)
-... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
 >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
->>> image = download_image(img_url)
->>> _ = pipe.invert(
-... image = image,
-... num_inversion_steps=50,
-... skip=0.1
-... )
+>>> image = load_image(img_url).convert("RGB")
+>>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)
 >>> edited_image = pipe(
-... editing_prompt=["cherry blossom"],
-... edit_guidance_scale=10.0,
-... edit_threshold=0.75,
-).images[0]
+... editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75
+... ).images[0]
 ```
 """
@@ -279,8 +270,8 @@ class LEditsPPPipelineStableDiffusion(
 unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
 scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
 A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
-be set to [`DPMSolverMultistepScheduler`].
+[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+automatically be set to [`DPMSolverMultistepScheduler`].
 safety_checker ([`StableDiffusionSafetyChecker`]):
 Classification module that estimates whether generated images could be considered offensive or harmful.
 Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
@@ -531,8 +522,7 @@ class LEditsPPPipelineStableDiffusion(
 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
 less than `1`).
 editing_prompt (`str` or `List[str]`, *optional*):
-Editing prompt(s) to be encoded. If not defined, one has to pass
-`editing_prompt_embeds` instead.
+Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead.
 editing_prompt_embeds (`torch.FloatTensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
 provided, text embeddings will be generated from `prompt` input argument.
@@ -734,8 +724,9 @@ class LEditsPPPipelineStableDiffusion(
 **kwargs,
 ):
 r"""
-The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`]
-method has to be called beforehand. Edits will always be performed for the last inverted image(s).
+The call function to the pipeline for editing. The
+[`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] method has to be called beforehand. Edits will
+always be performed for the last inverted image(s).
 Args:
 negative_prompt (`str` or `List[str]`, *optional*):
@@ -748,49 +739,51 @@ class LEditsPPPipelineStableDiffusion(
 The output format of the generate image. Choose between
 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
 return_dict (`bool`, *optional*, defaults to `True`):
-Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a
-plain tuple.
+Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a plain
+tuple.
 editing_prompt (`str` or `List[str]`, *optional*):
 The prompt or prompts to guide the image generation. The image is reconstructed by setting
-`editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
+`editing_prompt = None`. Guidance direction of prompt should be specified via
+`reverse_editing_direction`.
 editing_prompt_embeds (`torch.Tensor>`, *optional*):
-Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be
-specified via `reverse_editing_direction`.
+Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should
+be specified via `reverse_editing_direction`.
 negative_prompt_embeds (`torch.FloatTensor`, *optional*):
 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
 reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
 Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
 edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
-Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`.
-`edit_guidance_scale` is defined as `s_e` of equation 12 of
-[LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+Guidance scale for guiding the image generation. If provided as list values should correspond to
+`editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247).
 edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
 Number of diffusion steps (for each prompt) for which guidance will not be applied.
 edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
 Number of diffusion steps (for each prompt) after which guidance will no longer be applied.
 edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
 Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
-'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247).
 user_mask (`torch.FloatTensor`, *optional*):
-User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
-masks do not meet user preferences.
+User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+implicit masks do not meet user preferences.
 sem_guidance (`List[torch.Tensor]`, *optional*):
 List of pre-generated guidance vectors to be applied at generation. Length of the list has to
 correspond to `num_inference_steps`.
 use_cross_attn_mask (`bool`, defaults to `False`):
 Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
-is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
-[LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+paper](https://arxiv.org/pdf/2311.16711.pdf).
 use_intersect_mask (`bool`, defaults to `True`):
-Whether the masking term is calculated as intersection of cross-attention masks and masks derived
-from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
-estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
 attn_store_steps (`List[int]`, *optional*):
 Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
 store_averaged_over_steps (`bool`, defaults to `True`):
-Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
-If False, attention maps for each step are stores separately. Just for visualization purposes.
+Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+False, attention maps for each step are stores separately. Just for visualization purposes.
 cross_attention_kwargs (`dict`, *optional*):
 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -815,10 +808,10 @@ class LEditsPPPipelineStableDiffusion(
 Returns:
 [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
-[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
-otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the
-second element is a list of `bool`s denoting whether the corresponding generated image likely represents
-"not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+returning a tuple, the first element is a list with the generated images, and the second element is a list
+of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+content, according to the `safety_checker`.
 """
 if self.inversion_steps is None:
@@ -1219,9 +1212,9 @@ class LEditsPPPipelineStableDiffusion(
 crops_coords: Optional[Tuple[int, int, int, int]] = None,
 ):
 r"""
-The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
-If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
-will be performed instead.
+The function to the pipeline for image inversion as described by the [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.
 Args:
 image (`PipelineImageInput`):
@@ -1238,8 +1231,8 @@ class LEditsPPPipelineStableDiffusion(
 Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
 will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
 generator (`torch.Generator`, *optional*):
-A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-inversion deterministic.
+A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+deterministic.
 cross_attention_kwargs (`dict`, *optional*):
 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -1247,23 +1240,24 @@ class LEditsPPPipelineStableDiffusion(
 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
 the output of the pre-final layer will be used for computing the prompt embeddings.
 height (`int`, *optional*, defaults to `None`):
-The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height.
+The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
+height.
 width (`int`, *optional*`, defaults to `None`):
 The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
 resize_mode (`str`, *optional*, defaults to `default`):
-The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit
-within the specified width and height, and it may not maintaining the original aspect ratio.
-If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
-within the dimensions, filling empty with data from image.
-If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
-within the dimensions, cropping the excess.
-Note that resize_mode `fill` and `crop` are only supported for PIL image input.
+The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
+the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
+resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
+center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
+image to fit within the specified width and height, maintaining the aspect ratio, and then center the
+image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
+supported for PIL image input.
 crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
 The crop coordinates for each image in the batch. If `None`, will not crop the image.
 Returns:
-[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
-Output will contain the resized input image(s) and respective VAE reconstruction(s).
+[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+and respective VAE reconstruction(s).
 """
 # Reset attn processor, we do not want to store attn maps during inversion
 self.unet.set_attn_processor(AttnProcessor())
......
@@ -85,25 +85,23 @@ EXAMPLE_DOC_STRING = """
 ... )
 >>> pipe = pipe.to("cuda")
 >>> def download_image(url):
 ... response = requests.get(url)
 ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
 >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg"
 >>> image = download_image(img_url)
->>> _ = pipe.invert(
-... image = image,
-... num_inversion_steps=50,
-... skip=0.2
-... )
+>>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2)
 >>> edited_image = pipe(
-... editing_prompt=["tennis ball","tomato"],
-... reverse_editing_direction=[True,False],
-... edit_guidance_scale=[5.0,10.0],
-... edit_threshold=[0.9,0.85],
-).images[0]
+... editing_prompt=["tennis ball", "tomato"],
+... reverse_editing_direction=[True, False],
+... edit_guidance_scale=[5.0, 10.0],
+... edit_threshold=[0.9, 0.85],
+... ).images[0]
 ```
 """
@@ -292,9 +290,9 @@ class LEditsPPPipelineStableDiffusionXL(
 """
 Pipeline for textual image editing using LEDits++ with Stable Diffusion XL.
-This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the superclass
-documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
-device, etc.).
+This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the
+superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a
+particular device, etc.).
 In addition the pipeline inherits the following loading methods:
 - *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`]
@@ -325,8 +323,8 @@ class LEditsPPPipelineStableDiffusionXL(
 unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
 scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
 A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
-be set to [`DPMSolverMultistepScheduler`].
+[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+automatically be set to [`DPMSolverMultistepScheduler`].
 force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
 Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
 `stabilityai/stable-diffusion-xl-base-1-0`.
@@ -453,9 +451,9 @@ class LEditsPPPipelineStableDiffusionXL(
 Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass
 `editing_prompt_embeds` instead.
 editing_prompt_embeds (`torch.FloatTensor`, *optional*):
-Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input
-argument.
+Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from
+`editing_prompt` input argument.
 editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
 Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt`
@@ -835,8 +833,9 @@ class LEditsPPPipelineStableDiffusionXL(
 **kwargs,
 ):
 r"""
-The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`]
-method has to be called beforehand. Edits will always be performed for the last inverted image(s).
+The call function to the pipeline for editing. The
+[`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] method has to be called beforehand. Edits
+will always be performed for the last inverted image(s).
 Args:
 denoising_end (`float`, *optional*):
@@ -894,11 +893,11 @@ class LEditsPPPipelineStableDiffusionXL(
 section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
 editing_prompt (`str` or `List[str]`, *optional*):
 The prompt or prompts to guide the image generation. The image is reconstructed by setting
-`editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
+`editing_prompt = None`. Guidance direction of prompt should be specified via
+`reverse_editing_direction`.
 editing_prompt_embeddings (`torch.Tensor`, *optional*):
-Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
-argument.
+Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument.
 editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*):
 Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
@@ -906,35 +905,36 @@ class LEditsPPPipelineStableDiffusionXL(
 reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
 Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
 edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
-Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`.
-`edit_guidance_scale` is defined as `s_e` of equation 12 of
-[LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+Guidance scale for guiding the image generation. If provided as list values should correspond to
+`editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247).
 edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
 Number of diffusion steps (for each prompt) for which guidance is not applied.
 edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
 Number of diffusion steps (for each prompt) after which guidance is no longer applied.
 edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
 Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
-'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247).
 sem_guidance (`List[torch.Tensor]`, *optional*):
 List of pre-generated guidance vectors to be applied at generation. Length of the list has to
 correspond to `num_inference_steps`.
 use_cross_attn_mask:
 Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
-is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
-[LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+paper](https://arxiv.org/pdf/2311.16711.pdf).
 use_intersect_mask:
-Whether the masking term is calculated as intersection of cross-attention masks and masks derived
-from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
-estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
 user_mask:
-User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
-masks do not meet user preferences.
+User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+implicit masks do not meet user preferences.
 attn_store_steps:
 Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
 store_averaged_over_steps:
-Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
-If False, attention maps for each step are stores separately. Just for visualization purposes.
+Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+False, attention maps for each step are stores separately. Just for visualization purposes.
 clip_skip (`int`, *optional*):
 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
 the output of the pre-final layer will be used for computing the prompt embeddings.
@@ -952,8 +952,8 @@ class LEditsPPPipelineStableDiffusionXL(
 Returns:
 [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
-[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
-otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+returning a tuple, the first element is a list with the generated images.
 """
 if self.inversion_steps is None:
 raise ValueError(
@@ -1446,9 +1446,9 @@ class LEditsPPPipelineStableDiffusionXL(
 cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 ):
 r"""
-The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
-If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
-will be performed instead.
+The function to the pipeline for image inversion as described by the [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.
 Args:
 image (`PipelineImageInput`):
@@ -1472,8 +1472,8 @@ class LEditsPPPipelineStableDiffusionXL(
 Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
 will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
 generator (`torch.Generator`, *optional*):
-A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-inversion deterministic.
+A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+deterministic.
 crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
 `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
 `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
@@ -1488,8 +1488,8 @@ class LEditsPPPipelineStableDiffusionXL(
 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
 Returns:
-[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
-Output will contain the resized input image(s) and respective VAE reconstruction(s).
+[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+and respective VAE reconstruction(s).
 """
 # Reset attn processor, we do not want to store attn maps during inversion
......
@@ -35,8 +35,8 @@ class LEditsPPInversionPipelineOutput(BaseOutput):
 List of the cropped and resized input images as PIL images of length `batch_size` or NumPy array of shape `
 (batch_size, height, width, num_channels)`.
 vae_reconstruction_images (`List[PIL.Image.Image]` or `np.ndarray`)
-List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape `
-(batch_size, height, width, num_channels)`.
+List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape
+` (batch_size, height, width, num_channels)`.
 """
 images: Union[List[PIL.Image.Image], np.ndarray]
......
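Continuing the LEDITS++ examples above, a short sketch of reading the inversion output fields documented in this hunk (attribute names come from the docstring; `pipe` and `image` are as in the earlier example):

```python
inversion_output = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)
resized_inputs = inversion_output.images                      # cropped/resized input image(s)
reconstructions = inversion_output.vae_reconstruction_images  # corresponding VAE reconstructions
```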
@@ -59,6 +59,7 @@ EXAMPLE_DOC_STRING = """
 ... PIAPipeline,
 ... )
 >>> from diffusers.utils import export_to_gif, load_image
+>>> adapter = MotionAdapter.from_pretrained("../checkpoints/pia-diffusers")
 >>> pipe = PIAPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter)
 >>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
@@ -135,9 +136,9 @@ class PIAPipelineOutput(BaseOutput):
 Args:
 frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
-NumPy array of shape `(batch_size, num_frames, channels, height, width,
-Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
+Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, NumPy array of
+shape `(batch_size, num_frames, channels, height, width, Torch tensor of shape `(batch_size, num_frames,
+channels, height, width)`.
 """
 frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
@@ -759,16 +760,15 @@ class PIAPipeline(
 ip_adapter_image: (`PipelineImageInput`, *optional*):
 Optional image input to work with IP Adapters.
 ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-if `do_classifier_free_guidance` is set to `True`.
-If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+provided, embeddings are computed from the `ip_adapter_image` input argument.
 motion_scale: (`int`, *optional*, defaults to 0):
-Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific
-ranges of values control the type of motion that is added. Must be between 0 and 8.
-Set between 0-2 to only increase the amount of motion.
-Set between 3-5 to create looping motion.
-Set between 6-8 to perform motion with image style transfer.
+Parameter that controls the amount and type of motion that is added to the image. Increasing the value
+increases the amount of motion, while specific ranges of values control the type of motion that is
+added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5
+to create looping motion. Set between 6-8 to perform motion with image style transfer.
 output_type (`str`, *optional*, defaults to `"pil"`):
 The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
 `np.array`.
@@ -795,8 +795,8 @@ class PIAPipeline(
 Returns:
 [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`:
-If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is
-returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is returned, otherwise a
+`tuple` is returned where the first element is a list with the generated frames.
 """
 # 0. Default height and width to unet
 height = height or self.unet.config.sample_size * self.vae_scale_factor
......
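Since the `motion_scale` ranges rewrapped above are easy to misread, here is a hedged usage sketch of the PIA pipeline with a small motion increase (`motion_scale=2`). The adapter and base checkpoint ids mirror the public PIA example and are assumptions rather than part of this diff (the docstring example above still points at a local path):

```py
import torch
from diffusers import EulerDiscreteScheduler, MotionAdapter, PIAPipeline
from diffusers.utils import export_to_gif, load_image

# Public PIA motion adapter checkpoint (assumption).
adapter = MotionAdapter.from_pretrained("openmmlab/PIA-condition-adapter")
pipe = PIAPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
)
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

image = load_image("https://example.com/condition.png")  # placeholder conditioning image

output = pipe(
    image=image,
    prompt="a cat wearing sunglasses",
    motion_scale=2,  # 0-2: more motion, 3-5: looping motion, 6-8: motion with style transfer
    num_inference_steps=25,
)
export_to_gif(output.frames[0], "pia_output.gif")
```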
...@@ -538,7 +538,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin): ...@@ -538,7 +538,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
allowed by Git. allowed by Git.
custom_revision (`str`, *optional*): custom_revision (`str`, *optional*):
The specific model version to use. It can be a branch name, a tag name, or a commit id similar to The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
`revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers version. `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers
version.
mirror (`str`, *optional*): mirror (`str`, *optional*):
Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
guarantee the timeliness or safety of the source, and you should refer to the mirror site for more guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
...@@ -1669,7 +1670,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin): ...@@ -1669,7 +1670,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
@classmethod @classmethod
def from_pipe(cls, pipeline, **kwargs): def from_pipe(cls, pipeline, **kwargs):
r""" r"""
Create a new pipeline from a given pipeline. This method is useful for creating a new pipeline from existing pipeline components without allocating additional memory. Create a new pipeline from a given pipeline. This method is useful for creating a new pipeline from existing
pipeline components without allocating additional memory.
Arguments: Arguments:
pipeline (`DiffusionPipeline`): pipeline (`DiffusionPipeline`):
...@@ -1851,8 +1853,8 @@ class StableDiffusionMixin: ...@@ -1851,8 +1853,8 @@ class StableDiffusionMixin:
def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
""" """
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
key, value) are fused. For cross-attention modules, key and value projection matrices are fused. are fused. For cross-attention modules, key and value projection matrices are fused.
<Tip warning={true}> <Tip warning={true}>
......
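The `from_pipe` and `fuse_qkv_projections` docstrings above describe behaviour rather than usage, so a minimal sketch may help; the checkpoint id is an assumption and the target pipeline class is just one possible choice:

```py
import torch
from diffusers import DiffusionPipeline, StableDiffusionSAGPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Fuse the QKV projections in the UNet and VAE attention modules (experimental API).
pipe.fuse_qkv_projections()

# Reuse the already-loaded components in a different pipeline class; no extra
# copies of the text encoder, UNet, or VAE are allocated.
pipe_sag = StableDiffusionSAGPipeline.from_pipe(pipe)
image = pipe_sag("a photo of an astronaut riding a horse").images[0]

pipe.unfuse_qkv_projections()
```

Because `from_pipe` shares the component modules rather than copying them, changes such as the fused projections apply to both pipeline objects.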
...@@ -186,8 +186,8 @@ def retrieve_timesteps( ...@@ -186,8 +186,8 @@ def retrieve_timesteps(
scheduler (`SchedulerMixin`): scheduler (`SchedulerMixin`):
The scheduler to get timesteps from. The scheduler to get timesteps from.
num_inference_steps (`int`): num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
`timesteps` must be `None`. must be `None`.
device (`str` or `torch.device`, *optional*): device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved. The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*): timesteps (`List[int]`, *optional*):
......
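`retrieve_timesteps` shows up in several of the files touched here. The contract stated in its docstring (pass either `num_inference_steps` or explicit `timesteps`, never both) can be illustrated with a short sketch; each touched file carries its own copy of the helper, and importing it from the text-to-image pipeline module below is an assumption for illustration only:

```py
import inspect

from diffusers import EulerDiscreteScheduler
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps

scheduler = EulerDiscreteScheduler.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
)

# Derive the timestep schedule from a step count (`timesteps` must be None here).
timesteps, num_inference_steps = retrieve_timesteps(
    scheduler, num_inference_steps=30, device="cpu"
)

# Explicit timesteps only work if the scheduler's `set_timesteps` accepts a
# `timesteps` argument; otherwise the helper raises a ValueError.
if "timesteps" in inspect.signature(scheduler.set_timesteps).parameters:
    custom_timesteps, _ = retrieve_timesteps(
        scheduler, device="cpu", timesteps=[900, 700, 500, 300, 100]
    )
```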
...@@ -334,8 +334,8 @@ class StableCascadeDecoderPipeline(DiffusionPipeline): ...@@ -334,8 +334,8 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
argument. argument.
negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
argument. input argument.
num_images_per_prompt (`int`, *optional*, defaults to 1): num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt. The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
......
...@@ -31,7 +31,10 @@ TEXT2IMAGE_EXAMPLE_DOC_STRING = """ ...@@ -31,7 +31,10 @@ TEXT2IMAGE_EXAMPLE_DOC_STRING = """
```py ```py
>>> import torch >>> import torch
>>> from diffusers import StableCascadeCombinedPipeline >>> from diffusers import StableCascadeCombinedPipeline
>>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16)
>>> pipe = StableCascadeCombinedPipeline.from_pretrained(
... "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
... )
>>> pipe.enable_model_cpu_offload() >>> pipe.enable_model_cpu_offload()
>>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
>>> images = pipe(prompt=prompt) >>> images = pipe(prompt=prompt)
......
...@@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline): ...@@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline):
prior ([`StableCascadeUNet`]): prior ([`StableCascadeUNet`]):
The Stable Cascade prior to approximate the image embedding from the text and/or image embedding. The Stable Cascade prior to approximate the image embedding from the text and/or image embedding.
text_encoder ([`CLIPTextModelWithProjection`]): text_encoder ([`CLIPTextModelWithProjection`]):
Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). Frozen text-encoder
([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
feature_extractor ([`~transformers.CLIPImageProcessor`]): feature_extractor ([`~transformers.CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `image_encoder`. Model that extracts features from generated images to be used as inputs for the `image_encoder`.
image_encoder ([`CLIPVisionModelWithProjection`]): image_encoder ([`CLIPVisionModelWithProjection`]):
...@@ -420,11 +421,11 @@ class StableCascadePriorPipeline(DiffusionPipeline): ...@@ -420,11 +421,11 @@ class StableCascadePriorPipeline(DiffusionPipeline):
argument. argument.
negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
argument. input argument.
image_embeds (`torch.FloatTensor`, *optional*): image_embeds (`torch.FloatTensor`, *optional*):
Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If
If not provided, image embeddings will be generated from the `image` input argument, if available. not provided, image embeddings will be generated from the `image` input argument, if available.
num_images_per_prompt (`int`, *optional*, defaults to 1): num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt. The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
...@@ -452,9 +453,9 @@ class StableCascadePriorPipeline(DiffusionPipeline): ...@@ -452,9 +453,9 @@ class StableCascadePriorPipeline(DiffusionPipeline):
Examples: Examples:
Returns: Returns:
[`StableCascadePriorPipelineOutput`] or `tuple`: [`StableCascadePriorPipelineOutput`] if [`StableCascadePriorPipelineOutput`] or `tuple`: [`StableCascadePriorPipelineOutput`] if `return_dict` is
`return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
generated image embeddings. embeddings.
""" """
# 0. Define commonly used variables # 0. Define commonly used variables
......
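The prior and decoder hunks above both deal with pre-generated (pooled) embeddings; the typical two-stage flow that produces and then consumes `image_embeddings` looks roughly like this. The checkpoint ids follow the public Stable Cascade release and are assumptions here:

```py
import torch
from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline

prompt = "an image of a shiba inu, donning a spacesuit and helmet"

prior = StableCascadePriorPipeline.from_pretrained(
    "stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16
)
decoder = StableCascadeDecoderPipeline.from_pretrained(
    "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.float16
)

prior.enable_model_cpu_offload()
# Stage C (prior): text -> image embeddings.
prior_output = prior(prompt=prompt, num_inference_steps=20, guidance_scale=4.0)

decoder.enable_model_cpu_offload()
# Stage B (decoder): image embeddings + text -> pixels.
images = decoder(
    image_embeddings=prior_output.image_embeddings.to(torch.float16),
    prompt=prompt,
    num_inference_steps=10,
    guidance_scale=0.0,
).images
images[0].save("cascade.png")
```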
...@@ -85,8 +85,8 @@ def retrieve_timesteps( ...@@ -85,8 +85,8 @@ def retrieve_timesteps(
scheduler (`SchedulerMixin`): scheduler (`SchedulerMixin`):
The scheduler to get timesteps from. The scheduler to get timesteps from.
num_inference_steps (`int`): num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
`timesteps` must be `None`. must be `None`.
device (`str` or `torch.device`, *optional*): device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved. The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*): timesteps (`List[int]`, *optional*):
...@@ -801,10 +801,10 @@ class StableDiffusionPipeline( ...@@ -801,10 +801,10 @@ class StableDiffusionPipeline(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-adapters. Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
if `do_classifier_free_guidance` is set to `True`. contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
If not provided, embeddings are computed from the `ip_adapter_image` input argument. provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`. The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
......
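The `ip_adapter_image_embeds` wording rewrapped above is dense; in practice most users pass `ip_adapter_image` and let the pipeline compute the embeddings itself. A hedged sketch, where the adapter repo and weight name follow the public IP-Adapter release and the image URL is a placeholder:

```py
import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils import load_image

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Public IP-Adapter weights for SD 1.5 (assumption).
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipe.set_ip_adapter_scale(0.6)

style_image = load_image("https://example.com/style.png")  # placeholder reference image

# The pipeline computes the positive (and, under classifier-free guidance, negative)
# image embeddings internally; precomputed tensors could instead be passed via
# `ip_adapter_image_embeds`, one entry per loaded IP-Adapter.
image = pipe(
    prompt="a cat, best quality",
    ip_adapter_image=style_image,
    num_inference_steps=30,
).images[0]
```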
...@@ -125,8 +125,8 @@ def retrieve_timesteps( ...@@ -125,8 +125,8 @@ def retrieve_timesteps(
scheduler (`SchedulerMixin`): scheduler (`SchedulerMixin`):
The scheduler to get timesteps from. The scheduler to get timesteps from.
num_inference_steps (`int`): num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
`timesteps` must be `None`. must be `None`.
device (`str` or `torch.device`, *optional*): device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved. The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*): timesteps (`List[int]`, *optional*):
...@@ -897,10 +897,10 @@ class StableDiffusionImg2ImgPipeline( ...@@ -897,10 +897,10 @@ class StableDiffusionImg2ImgPipeline(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-adapters. Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
if `do_classifier_free_guidance` is set to `True`. contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
If not provided, embeddings are computed from the `ip_adapter_image` input argument. provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`. The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
......
...@@ -189,8 +189,8 @@ def retrieve_timesteps( ...@@ -189,8 +189,8 @@ def retrieve_timesteps(
scheduler (`SchedulerMixin`): scheduler (`SchedulerMixin`):
The scheduler to get timesteps from. The scheduler to get timesteps from.
num_inference_steps (`int`): num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
`timesteps` must be `None`. must be `None`.
device (`str` or `torch.device`, *optional*): device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved. The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*): timesteps (`List[int]`, *optional*):
...@@ -1022,11 +1022,12 @@ class StableDiffusionInpaintPipeline( ...@@ -1022,11 +1022,12 @@ class StableDiffusionInpaintPipeline(
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image. The width in pixels of the generated image.
padding_mask_crop (`int`, *optional*, defaults to `None`): padding_mask_crop (`int`, *optional*, defaults to `None`):
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
`padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio as the image that image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
contains all of the masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on with the same aspect ratio as the image that contains all of the masked area, and then expand that area
the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large based on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area
and contains information irrelevant for inpainting, such as background. before resizing to the original image size for inpainting. This is useful when the masked area is small
while the image is large and contains information irrelevant for inpainting, such as background.
strength (`float`, *optional*, defaults to 1.0): strength (`float`, *optional*, defaults to 1.0):
Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
starting point and more noise is added the higher the `strength`. The number of denoising steps depends starting point and more noise is added the higher the `strength`. The number of denoising steps depends
...@@ -1066,10 +1067,10 @@ class StableDiffusionInpaintPipeline( ...@@ -1066,10 +1067,10 @@ class StableDiffusionInpaintPipeline(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-adapters. Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
if `do_classifier_free_guidance` is set to `True`. contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
If not provided, embeddings are computed from the `ip_adapter_image` input argument. provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`. The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
......
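The reflowed `padding_mask_crop` description above is the subtle one in this file, so a short usage sketch may help; the checkpoint id and image URLs are placeholder assumptions:

```py
import torch
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

init_image = load_image("https://example.com/room.png")       # placeholder
mask_image = load_image("https://example.com/room_mask.png")  # placeholder; white = inpaint

# With `padding_mask_crop=32`, only a 32px-padded crop around the masked region
# (matching the image's aspect ratio) is denoised, then pasted back into the original.
image = pipe(
    prompt="a wooden chair",
    image=init_image,
    mask_image=mask_image,
    padding_mask_crop=32,
    strength=1.0,
).images[0]
```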