Unverified Commit a69754bb authored by Steven Liu, committed by GitHub

[docs] Clean up pipeline apis (#3905)

* start with stable diffusion

* fix

* finish stable diffusion pipelines

* fix path to pipeline output

* fix flax paths

* fix copies

* add up to score sde ve

* finish first pass of pipelines

* fix copies

* second review

* align doc titles

* more review fixes

* final review
parent bcc570b9
@@ -26,13 +26,16 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
class DanceDiffusionPipeline(DiffusionPipeline):
r"""
Pipeline for audio generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet1DModel`]):
A `UNet1DModel` to denoise the encoded audio.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
[`IPNDMScheduler`].
"""
@@ -50,6 +53,8 @@ class DanceDiffusionPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[AudioPipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of audio samples to generate.
@@ -57,17 +62,40 @@ class DanceDiffusionPipeline(DiffusionPipeline):
The number of denoising steps. More denoising steps usually lead to a higher-quality audio sample at
the expense of slower inference.
generator (`torch.Generator`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`):
The length of the generated audio sample in seconds.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
Example:
```py
from diffusers import DiffusionPipeline
from scipy.io.wavfile import write
model_id = "harmonai/maestro-150k"
pipe = DiffusionPipeline.from_pretrained(model_id)
pipe = pipe.to("cuda")
audios = pipe(audio_length_in_s=4.0).audios
# To save locally
for i, audio in enumerate(audios):
    write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose())

# To display in Google Colab
import IPython.display as ipd

for audio in audios:
    display(ipd.Audio(audio, rate=pipe.unet.sample_rate))
```
Returns:
[`~pipelines.AudioPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated audio.
"""
if audio_length_in_s is None:
...
@@ -23,11 +23,14 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DDIMPipeline(DiffusionPipeline):
r"""
Pipeline for image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
@@ -53,29 +56,56 @@ class DDIMPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of images to generate.
generator (`torch.Generator`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. A value of `0` corresponds to
DDIM and `1` corresponds to DDPM.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
use_clipped_model_output (`bool`, *optional*, defaults to `None`):
If `True` or `False`, see documentation for [`DDIMScheduler.step`]. If `None`, nothing is passed
downstream to the scheduler (use `None` for schedulers which don't support this argument).
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DDIMPipeline
>>> import PIL.Image
>>> import numpy as np
>>> # load model and scheduler
>>> pipe = DDIMPipeline.from_pretrained("fusing/ddim-lsun-bedroom")
>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pipe(eta=0.0, num_inference_steps=50)
>>> # process image to PIL
>>> image_processed = image.cpu().permute(0, 2, 3, 1)
>>> image_processed = (image_processed + 1.0) * 127.5
>>> image_processed = image_processed.numpy().astype(np.uint8)
>>> image_pil = PIL.Image.fromarray(image_processed[0])
>>> # save image
>>> image_pil.save("test.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
# Sample gaussian noise to begin loop
...
@@ -23,11 +23,14 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DDPMPipeline(DiffusionPipeline):
r"""
Pipeline for image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
@@ -47,24 +50,41 @@ class DDPMPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of images to generate.
generator (`torch.Generator`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 1000):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DDPMPipeline
>>> # load model and scheduler
>>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")
>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pipe().images[0]
>>> # save image
>>> image.save("ddpm_generated_image.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
# Sample gaussian noise to begin loop
if isinstance(self.unet.config.sample_size, int):
...
@@ -30,16 +30,18 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DiTPipeline(DiffusionPipeline):
r"""
Pipeline for image generation based on a Transformer backbone instead of a UNet.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
transformer ([`Transformer2DModel`]):
A class conditioned `Transformer2DModel` to denoise the encoded image latents.
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
scheduler ([`DDIMScheduler`]):
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
"""
def __init__(
@@ -63,13 +65,15 @@ class DiTPipeline(DiffusionPipeline):
def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
r"""
Map label strings from ImageNet to corresponding class ids.
Parameters:
label (`str` or `List[str]`):
Label strings to be mapped to class ids.
Returns:
`list` of `int`:
Class ids to be processed by pipeline.
"""
if not isinstance(label, list):
@@ -94,24 +98,53 @@ class DiTPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
class_labels (List[int]):
List of ImageNet class labels for the images to be generated.
guidance_scale (`float`, *optional*, defaults to 4.0):
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
generator (`torch.Generator`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 250):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Examples:
```py
>>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler
>>> import torch
>>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
>>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
>>> pipe = pipe.to("cuda")
>>> # pick words from ImageNet class labels
>>> pipe.labels # to print all available words
>>> # pick words that exist in ImageNet
>>> words = ["white shark", "umbrella"]
>>> class_ids = pipe.get_label_ids(words)
>>> generator = torch.manual_seed(33)
>>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)
>>> image = output.images[0] # label 'white shark'
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
batch_size = len(class_labels)
...
@@ -31,18 +31,20 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class LDMTextToImagePipeline(DiffusionPipeline):
r"""
Pipeline for text-to-image generation using latent diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
vqvae ([`VQModel`]):
Vector-quantized (VQ) model to encode and decode images to and from latent representations.
bert ([`LDMBertModel`]):
Text-encoder model based on [`~transformers.BERT`].
tokenizer ([`~transformers.BertTokenizer`]):
A `BertTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
@@ -76,38 +78,54 @@ class LDMTextToImagePipeline(DiffusionPipeline):
**kwargs,
) -> Union[Tuple, ImagePipelineOutput]:
r"""
The call function to the pipeline for generation.
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 1.0):
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
generator (`torch.Generator`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DiffusionPipeline
>>> # load model and scheduler
>>> ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
>>> # run pipeline in inference (sample random noise and denoise)
>>> prompt = "A painting of a squirrel eating a burger"
>>> images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images
>>> # save images
>>> for idx, image in enumerate(images):
...     image.save(f"squirrel-{idx}.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
# 0. Default height and width to unet
height = height or self.unet.config.sample_size * self.vae_scale_factor
...
@@ -31,15 +31,16 @@ def preprocess(image):
class LDMSuperResolutionPipeline(DiffusionPipeline):
r"""
A pipeline for image super-resolution using latent diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
vqvae ([`VQModel`]):
Vector-quantized (VQ) model to encode and decode images to and from latent representations.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
@@ -74,30 +75,58 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[Tuple, ImagePipelineOutput]:
r"""
The call function to the pipeline for generation.
Args:
image (`torch.Tensor` or `PIL.Image.Image`):
`Image` or tensor representing an image batch to be used as the starting point for the process.
batch_size (`int`, *optional*, defaults to 1):
Number of images to generate.
num_inference_steps (`int`, *optional*, defaults to 100):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> import requests
>>> from PIL import Image
>>> from io import BytesIO
>>> from diffusers import LDMSuperResolutionPipeline
>>> import torch
>>> # load model and scheduler
>>> pipeline = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
>>> pipeline = pipeline.to("cuda")
>>> # let's download an image
>>> url = (
... "https://user-images.githubusercontent.com/38061659/199705896-b48e17b8-b231-47cd-a270-4ffa5a93fa3e.png"
... )
>>> response = requests.get(url)
>>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
>>> low_res_img = low_res_img.resize((128, 128))
>>> # run pipeline in inference (sample random noise and denoise)
>>> upscaled_image = pipeline(low_res_img, num_inference_steps=100, eta=1).images[0]
>>> # save image
>>> upscaled_image.save("ldm_generated_image.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
if isinstance(image, PIL.Image.Image):
batch_size = 1
...
@@ -25,15 +25,18 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class LDMPipeline(DiffusionPipeline):
r"""
Pipeline for unconditional image generation using latent diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
vqvae ([`VQModel`]):
Vector-quantized (VQ) model to encode and decode images to and from latent representations.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
[`DDIMScheduler`] is used in combination with `unet` to denoise the encoded image latents.
"""
def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler):
@@ -52,24 +55,38 @@ class LDMPipeline(DiffusionPipeline):
**kwargs,
) -> Union[Tuple, ImagePipelineOutput]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
Number of images to generate.
generator (`torch.Generator`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import LDMPipeline
>>> # load model and scheduler
>>> pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pipe().images[0]
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
latents = randn_tensor(
...
@@ -136,28 +136,36 @@ def prepare_mask_and_masked_image(image, mask):
class PaintByExamplePipeline(DiffusionPipeline):
r"""
<Tip warning={true}>
🧪 This is an experimental feature!
</Tip>
Pipeline for image-guided image inpainting using Stable Diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
image_encoder ([`PaintByExampleImageEncoder`]):
Encodes the example input image. The `unet` is conditioned on the example image instead of a text prompt.
tokenizer ([`~transformers.CLIPTokenizer`]):
A `CLIPTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
safety_checker ([`StableDiffusionSafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful.
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
about a model's potential harms.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
# TODO: feature_extractor is required to encode initial images (if they are in PIL format),
# we should give a descriptive message if the pipeline doesn't have one.
@@ -378,66 +386,99 @@ class PaintByExamplePipeline(DiffusionPipeline):
callback_steps: int = 1,
):
r"""
The call function to the pipeline for generation.
Args:
example_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
An example image to guide image generation.
image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
`Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with
`mask_image` and repainted according to `prompt`).
mask_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
`Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted,
while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel
(luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the
expected shape would be `(B, H, W, 1)`.
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 7.5):
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple.
callback (`Callable`, *optional*):
A function that is called every `callback_steps` steps during inference. The function is called with the
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function is called. If not specified, the callback is called at
every step.
Example:
```py
>>> import PIL
>>> import requests
>>> import torch
>>> from io import BytesIO
>>> from diffusers import PaintByExamplePipeline
>>> def download_image(url):
...     response = requests.get(url)
...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
>>> img_url = (
... "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png"
... )
>>> mask_url = (
... "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png"
... )
>>> example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg"
>>> init_image = download_image(img_url).resize((512, 512))
>>> mask_image = download_image(mask_url).resize((512, 512))
>>> example_image = download_image(example_url).resize((512, 512))
>>> pipe = PaintByExamplePipeline.from_pretrained(
... "Fantasy-Studio/Paint-by-Example",
... torch_dtype=torch.float16,
... )
>>> pipe = pipe.to("cuda")
>>> image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
>>> image
```
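The `callback` and `callback_steps` arguments described above can be used to watch the denoising loop. A minimal sketch building on the example above (the `log_progress` helper is hypothetical, not part of the pipeline):
```py
def log_progress(step, timestep, latents):
    # latents is the current noisy latent batch at this denoising step
    print(f"step {step} (timestep {timestep}): latents shape {tuple(latents.shape)}")

image = pipe(
    image=init_image,
    mask_image=mask_image,
    example_image=example_image,
    callback=log_progress,
    callback_steps=10,  # invoke the callback every 10 steps
).images[0]
```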
Returns:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
otherwise a `tuple` is returned where the first element is a list with the generated images and the
second element is a list of `bool`s indicating whether the corresponding generated image contains
"not-safe-for-work" (nsfw) content.
"""
# 1. Define call parameters
if isinstance(image, PIL.Image.Image):
...
@@ -92,18 +92,17 @@ class FlaxImagePipelineOutput(BaseOutput):
class FlaxDiffusionPipeline(ConfigMixin):
r"""
Base class for Flax-based pipelines.
[`FlaxDiffusionPipeline`] stores all components (models, schedulers, and processors) for diffusion pipelines and
provides methods for loading, downloading and saving models. It also includes methods to:
- enable/disable the progress bar for the denoising iteration
Class attributes:
- **config_name** ([`str`]) -- The configuration filename that stores the class and module names of all the
diffusion pipeline's components.
"""
config_name = "model_index.json"
@@ -143,10 +142,9 @@ class FlaxDiffusionPipeline(ConfigMixin):
def save_pretrained(self, save_directory: Union[str, os.PathLike], params: Union[Dict, FrozenDict]):
# TODO: handle inference_state
"""
Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its
class implements both a save and loading method. The pipeline is easily reloaded using the
[`~FlaxDiffusionPipeline.from_pretrained`] class method.
Arguments:
save_directory (`str` or `os.PathLike`):
@@ -193,70 +191,61 @@ class FlaxDiffusionPipeline(ConfigMixin):
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
r""" r"""
Instantiate a Flax diffusion pipeline from pre-trained pipeline weights. Instantiate a Flax-based diffusion pipeline from pretrained pipeline weights.
The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). The pipeline is set in evaluation mode (`model.eval()) by default and dropout modules are deactivated.
The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come If you get the error message below, you need to finetune the weights for your downstream task:
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.
The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those ```
weights are discarded. Some weights of FlaxUNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
```
Parameters: Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
Can be either: Can be either:
- A string, the *repo id* of a pretrained pipeline hosted inside a model repo on - A string, the *repo id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained pipeline
https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like hosted on the Hub.
`CompVis/ldm-text2im-large-256`. - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
- A path to a *directory* containing pipeline weights saved using using [`~FlaxDiffusionPipeline.save_pretrained`].
[`~FlaxDiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`.
dtype (`str` or `jnp.dtype`, *optional*): dtype (`str` or `jnp.dtype`, *optional*):
Override the default `jnp.dtype` and load the model under this dtype. If `"auto"` is passed the dtype Override the default `jnp.dtype` and load the model under this dtype. If `"auto"`, the dtype is
will be automatically derived from the model's weights. automatically derived from the model's weights.
force_download (`bool`, *optional*, defaults to `False`): force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist. cached versions if they exist.
resume_download (`bool`, *optional*, defaults to `False`): resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
file exists. incompletely downloaded files are deleted.
proxies (`Dict[str, str]`, *optional*): proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(`bool`, *optional*, defaults to `False`): output_loading_info(`bool`, *optional*, defaults to `False`):
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(`bool`, *optional*, defaults to `False`): local_files_only (`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model). Whether to only load local model weights and configuration files or not. If set to `True`, the model
won't be downloaded from the Hub.
use_auth_token (`str` or *bool*, *optional*): use_auth_token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
when running `huggingface-cli login` (stored in `~/.huggingface`). `diffusers-cli login` (stored in `~/.huggingface`) is used.
revision (`str`, *optional*, defaults to `"main"`): revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any allowed by Git.
identifier allowed by git.
mirror (`str`, *optional*): mirror (`str`, *optional*):
Mirror source to accelerate downloads in China. If you are from China and have an accessibility Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
Please refer to the mirror site for more information. specify the folder name here. information.
kwargs (remaining dictionary of keyword arguments, *optional*): kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the Can be used to overwrite load and saveable variables (the pipeline components) of the specific pipeline
specific pipeline class. The overwritten components are then directly passed to the pipelines class. The overwritten components are passed directly to the pipeline's `__init__` method.
`__init__` method. See example below for more information.
<Tip>
It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`
</Tip>
<Tip> <Tip>
Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log in with
this method in a firewalled environment. `huggingface-cli login`. You can also activate the special
["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
firewalled environment.
</Tip> </Tip>
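For reference, a minimal loading sketch for the parameters documented above (the checkpoint name and its `bf16` branch are assumptions, not part of this diff):

```py
import jax.numpy as jnp

from diffusers import FlaxStableDiffusionPipeline

# Assumed checkpoint; the "bf16" revision stores the weights in bfloat16 and
# `dtype=jnp.bfloat16` runs the computation in that precision as well.
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    revision="bf16",
    dtype=jnp.bfloat16,
)
```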
...@@ -540,7 +529,7 @@ class FlaxDiffusionPipeline(ConfigMixin): ...@@ -540,7 +529,7 @@ class FlaxDiffusionPipeline(ConfigMixin):
@staticmethod @staticmethod
def numpy_to_pil(images): def numpy_to_pil(images):
""" """
Convert a numpy image or a batch of images to a PIL image. Convert a NumPy image or a batch of images to a PIL image.
""" """
if images.ndim == 3: if images.ndim == 3:
images = images[None, ...] images = images[None, ...]
......
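A minimal sketch of how `numpy_to_pil` is typically used; the random array here only stands in for real pipeline output:

```py
import numpy as np

from diffusers import DiffusionPipeline

# `numpy_to_pil` expects float images in [0, 1] with shape
# (batch, height, width, channels); a single HWC image is also accepted.
images = np.random.rand(2, 64, 64, 3).astype(np.float32)
pil_images = DiffusionPipeline.numpy_to_pil(images)  # the Flax pipeline exposes the same static method
print(len(pil_images), pil_images[0].size)  # 2 (64, 64)
```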
...@@ -463,13 +463,13 @@ class DiffusionPipeline(ConfigMixin): ...@@ -463,13 +463,13 @@ class DiffusionPipeline(ConfigMixin):
provides methods for loading, downloading and saving models. It also includes methods to: provides methods for loading, downloading and saving models. It also includes methods to:
- move all PyTorch modules to the device of your choice - move all PyTorch modules to the device of your choice
- enabling/disabling the progress bar for the denoising iteration - enable/disable the progress bar for the denoising iteration
Class attributes: Class attributes:
- **config_name** (`str`) -- The configuration filename that stores the class and module names of all the - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
diffusion pipeline's components. diffusion pipeline's components.
- **_optional_components** (List[`str`]) -- List of all optional components that don't have to be passed to the - **_optional_components** (`List[str]`) -- List of all optional components that don't have to be passed to the
pipeline to function (should be overridden by subclasses). pipeline to function (should be overridden by subclasses).
""" """
config_name = "model_index.json" config_name = "model_index.json"
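A minimal sketch of the two conveniences listed above, device placement and progress-bar control (the checkpoint name is an assumption):

```py
import torch

from diffusers import DiffusionPipeline

# Assumed checkpoint; `to()` moves every PyTorch module in the pipeline and
# `set_progress_bar_config` toggles the denoising progress bar.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to("cuda")
pipe.set_progress_bar_config(disable=True)
```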
...@@ -1475,10 +1475,9 @@ class DiffusionPipeline(ConfigMixin): ...@@ -1475,10 +1475,9 @@ class DiffusionPipeline(ConfigMixin):
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
r""" r"""
Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). When this
option is enabled, you should observe lower GPU memory usage and a potential speed up during inference. Speed
When this option is enabled, you should observe lower GPU memory usage and a potential speed up during up during training is not guaranteed.
inference. Speed up during training is not guaranteed.
<Tip warning={true}> <Tip warning={true}>
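A minimal usage sketch for the method above (checkpoint name assumed; requires the `xformers` package to be installed):

```py
import torch

from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Enable memory-efficient attention for inference, then switch it off again.
pipe.enable_xformers_memory_efficient_attention()
image = pipe("an astronaut riding a horse on mars").images[0]
pipe.disable_xformers_memory_efficient_attention()
```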
...@@ -1537,10 +1536,9 @@ class DiffusionPipeline(ConfigMixin): ...@@ -1537,10 +1536,9 @@ class DiffusionPipeline(ConfigMixin):
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r""" r"""
Enable sliced attention computation. Enable sliced attention computation. When this option is enabled, the attention module splits the input tensor
in slices to compute attention in several steps. This is useful to save some memory in exchange for a small
When this option is enabled, the attention module splits the input tensor in slices to compute attention in speed decrease.
several steps. This is useful to save some memory in exchange for a small speed decrease.
Args: Args:
slice_size (`str` or `int`, *optional*, defaults to `"auto"`): slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
......
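A minimal usage sketch for sliced attention (checkpoint name assumed); with `"auto"` the input to the attention heads is halved so attention is computed in two steps:

```py
import torch

from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Trade a small amount of speed for lower peak memory during attention.
pipe.enable_attention_slicing("auto")
image = pipe("a photo of an astronaut riding a horse on mars").images[0]
```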
...@@ -25,13 +25,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput ...@@ -25,13 +25,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class PNDMPipeline(DiffusionPipeline): class PNDMPipeline(DiffusionPipeline):
r""" r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the Pipeline for unconditional image generation.
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters: Parameters:
unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latents. unet ([`UNet2DModel`]):
scheduler ([`SchedulerMixin`]): A `UNet2DModel` to denoise the encoded image latents.
The `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image. scheduler ([`PNDMScheduler`]):
A `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image.
""" """
unet: UNet2DModel unet: UNet2DModel
...@@ -55,22 +58,41 @@ class PNDMPipeline(DiffusionPipeline): ...@@ -55,22 +58,41 @@ class PNDMPipeline(DiffusionPipeline):
**kwargs, **kwargs,
) -> Union[ImagePipelineOutput, Tuple]: ) -> Union[ImagePipelineOutput, Tuple]:
r""" r"""
The call function to the pipeline for generation.
Args: Args:
batch_size (`int`, `optional`, defaults to 1): The number of images to generate. batch_size (`int`, `optional`, defaults to 1):
The number of images to generate.
num_inference_steps (`int`, `optional`, defaults to 50): num_inference_steps (`int`, `optional`, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. expense of slower inference.
generator (`torch.Generator`, `optional`): A [torch generator (`torch.Generator`, `optional`):
generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
deterministic. generation deterministic.
output_type (`str`, `optional`, defaults to `"pil"`): The output format of the generate image. Choose output_type (`str`, `optional`, defaults to `"pil"`):
between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, `optional`, defaults to `True`): Whether or not to return a return_dict (`bool`, *optional*, defaults to `True`):
[`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import PNDMPipeline
>>> # load model and scheduler
>>> pndm = PNDMPipeline.from_pretrained("google/ddpm-cifar10-32")
>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pndm().images[0]
>>> # save image
>>> image.save("pndm_generated_image.png")
```
Returns: Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is [`~pipelines.ImagePipelineOutput`] or `tuple`:
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
""" """
# For more information on the sampling method you can take a look at Algorithm 2 of # For more information on the sampling method you can take a look at Algorithm 2 of
# the official paper: https://arxiv.org/pdf/2202.09778.pdf # the official paper: https://arxiv.org/pdf/2202.09778.pdf
......
...@@ -77,6 +77,19 @@ def _preprocess_mask(mask: Union[List, PIL.Image.Image, torch.Tensor]): ...@@ -77,6 +77,19 @@ def _preprocess_mask(mask: Union[List, PIL.Image.Image, torch.Tensor]):
class RePaintPipeline(DiffusionPipeline): class RePaintPipeline(DiffusionPipeline):
r"""
Pipeline for image inpainting using RePaint.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`RePaintScheduler`]):
A `RePaintScheduler` to be used in combination with `unet` to denoise the encoded image.
"""
unet: UNet2DModel unet: UNet2DModel
scheduler: RePaintScheduler scheduler: RePaintScheduler
...@@ -98,35 +111,77 @@ class RePaintPipeline(DiffusionPipeline): ...@@ -98,35 +111,77 @@ class RePaintPipeline(DiffusionPipeline):
return_dict: bool = True, return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]: ) -> Union[ImagePipelineOutput, Tuple]:
r""" r"""
The call function to the pipeline for generation.
Args: Args:
image (`torch.FloatTensor` or `PIL.Image.Image`): image (`torch.FloatTensor` or `PIL.Image.Image`):
The original image to inpaint on. The original image to inpaint on.
mask_image (`torch.FloatTensor` or `PIL.Image.Image`): mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
The mask_image where 0.0 values define which part of the original image to inpaint (change). The mask_image where 0.0 values define which part of the original image to inpaint.
num_inference_steps (`int`, *optional*, defaults to 1000): num_inference_steps (`int`, *optional*, defaults to 1000):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. expense of slower inference.
eta (`float`): eta (`float`):
The weight of noise for added noise in a diffusion step. Its value is between 0.0 and 1.0 - 0.0 is DDIM The weight of the added noise in a diffusion step. Its value is between 0.0 and 1.0; 0.0 corresponds to
and 1.0 is DDPM scheduler respectively. DDIM and 1.0 is the DDPM scheduler.
jump_length (`int`, *optional*, defaults to 10): jump_length (`int`, *optional*, defaults to 10):
The number of steps taken forward in time before going backward in time for a single jump ("j" in The number of steps taken forward in time before going backward in time for a single jump ("j" in
RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf. RePaint paper). Take a look at Figure 9 and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
jump_n_sample (`int`, *optional*, defaults to 10): jump_n_sample (`int`, *optional*, defaults to 10):
The number of times we will make forward time jump for a given chosen time sample. Take a look at The number of times to make a forward time jump for a given chosen time sample. Take a look at Figure 9
Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf. and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
generator (`torch.Generator`, *optional*): generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
to make generation deterministic. generation deterministic.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between The output format of the generated image. Choose between `PIL.Image` or `np.array`.
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from io import BytesIO
>>> import torch
>>> import PIL
>>> import requests
>>> from diffusers import RePaintPipeline, RePaintScheduler
>>> def download_image(url):
... response = requests.get(url)
... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
>>> img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png"
>>> mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
>>> # Load the original image and the mask as PIL images
>>> original_image = download_image(img_url).resize((256, 256))
>>> mask_image = download_image(mask_url).resize((256, 256))
>>> # Load the RePaint scheduler and pipeline based on a pretrained DDPM model
>>> scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256")
>>> pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
>>> pipe = pipe.to("cuda")
>>> generator = torch.Generator(device="cuda").manual_seed(0)
>>> output = pipe(
... image=original_image,
... mask_image=mask_image,
... num_inference_steps=250,
... eta=0.0,
... jump_length=10,
... jump_n_sample=10,
... generator=generator,
... )
>>> inpainted_image = output.images[0]
```
Returns: Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is [`~pipelines.ImagePipelineOutput`] or `tuple`:
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
""" """
original_image = image original_image = image
......
...@@ -24,11 +24,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput ...@@ -24,11 +24,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class ScoreSdeVePipeline(DiffusionPipeline): class ScoreSdeVePipeline(DiffusionPipeline):
r""" r"""
Pipeline for unconditional image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters: Parameters:
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the unet ([`UNet2DModel`]):
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) A `UNet2DModel` to denoise the encoded image.
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. scheduler ([`SchedulerMixin`]): scheduler ([`ScoreSdeVeScheduler`]):
The [`ScoreSdeVeScheduler`] scheduler to be used in combination with `unet` to denoise the encoded image. A `ScoreSdeVeScheduler` to be used in combination with `unet` to denoise the encoded image.
""" """
unet: UNet2DModel unet: UNet2DModel
scheduler: ScoreSdeVeScheduler scheduler: ScoreSdeVeScheduler
...@@ -48,21 +53,23 @@ class ScoreSdeVePipeline(DiffusionPipeline): ...@@ -48,21 +53,23 @@ class ScoreSdeVePipeline(DiffusionPipeline):
**kwargs, **kwargs,
) -> Union[ImagePipelineOutput, Tuple]: ) -> Union[ImagePipelineOutput, Tuple]:
r""" r"""
The call function to the pipeline for generation.
Args: Args:
batch_size (`int`, *optional*, defaults to 1): batch_size (`int`, *optional*, defaults to 1):
The number of images to generate. The number of images to generate.
generator (`torch.Generator`, *optional*): generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
to make generation deterministic. generation deterministic.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between The output format of the generated image. Choose between `PIL.Image` or `np.array`.
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Returns: Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is [`~pipelines.ImagePipelineOutput`] or `tuple`:
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
""" """
img_size = self.unet.config.sample_size img_size = self.unet.config.sample_size
......
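A minimal unconditional-generation sketch for the pipeline documented above (the checkpoint name and step count are assumptions):

```py
import torch

from diffusers import ScoreSdeVePipeline

# Assumed checkpoint trained with the SDE-VE formulation.
pipe = ScoreSdeVePipeline.from_pretrained("google/ncsnpp-celebahq-256")
pipe = pipe.to("cuda")

generator = torch.Generator(device="cuda").manual_seed(0)
image = pipe(batch_size=1, num_inference_steps=2000, generator=generator).images[0]
image.save("sde_ve_generated_image.png")
```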
...@@ -16,11 +16,11 @@ class SemanticStableDiffusionPipelineOutput(BaseOutput): ...@@ -16,11 +16,11 @@ class SemanticStableDiffusionPipelineOutput(BaseOutput):
Args: Args:
images (`List[PIL.Image.Image]` or `np.ndarray`) images (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. num_channels)`.
nsfw_content_detected (`List[bool]`) nsfw_content_detected (`List[bool]`)
List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
(nsfw) content, or `None` if safety checking could not be performed. `None` if safety checking could not be performed.
""" """
images: Union[List[PIL.Image.Image], np.ndarray] images: Union[List[PIL.Image.Image], np.ndarray]
......
...@@ -16,78 +16,34 @@ from . import SemanticStableDiffusionPipelineOutput ...@@ -16,78 +16,34 @@ from . import SemanticStableDiffusionPipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name logger = logging.get_logger(__name__) # pylint: disable=invalid-name
EXAMPLE_DOC_STRING = """
Examples:
```py
>>> import torch
>>> from diffusers import SemanticStableDiffusionPipeline
>>> pipe = SemanticStableDiffusionPipeline.from_pretrained(
... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
... )
>>> pipe = pipe.to("cuda")
>>> out = pipe(
... prompt="a photo of the face of a woman",
... num_images_per_prompt=1,
... guidance_scale=7,
... editing_prompt=[
... "smiling, smile", # Concepts to apply
... "glasses, wearing glasses",
... "curls, wavy hair, curly hair",
... "beard, full beard, mustache",
... ],
... reverse_editing_direction=[
... False,
... False,
... False,
... False,
... ], # Direction of guidance i.e. increase all concepts
... edit_warmup_steps=[10, 10, 10, 10], # Warmup period for each concept
... edit_guidance_scale=[4, 5, 5, 5.4], # Guidance scale for each concept
... edit_threshold=[
... 0.99,
... 0.975,
... 0.925,
... 0.96,
... ], # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
... edit_momentum_scale=0.3, # Momentum scale that will be added to the latent guidance
... edit_mom_beta=0.6, # Momentum beta
... edit_weights=[1, 1, 1, 1, 1], # Weights of the individual concepts against each other
... )
>>> image = out.images[0]
```
"""
class SemanticStableDiffusionPipeline(DiffusionPipeline): class SemanticStableDiffusionPipeline(DiffusionPipeline):
r""" r"""
Pipeline for text-to-image generation with latent editing. Pipeline for text-to-image generation using Stable Diffusion with latent editing.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionPipeline`]. Check the superclass
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
device, etc.).
This model builds on the implementation of ['StableDiffusionPipeline']
Args: Args:
vae ([`AutoencoderKL`]): vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
text_encoder ([`CLIPTextModel`]): text_encoder ([`~transformers.CLIPTextModel`]):
Frozen text-encoder. Stable Diffusion uses the text portion of Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically tokenizer ([`~transformers.CLIPTokenizer`]):
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. A `CLIPTokenizer` to tokenize text.
tokenizer (`CLIPTokenizer`): unet ([`UNet2DConditionModel`]):
Tokenizer of class A `UNet2DConditionModel` to denoise the encoded image latents.
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]): scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
safety_checker ([`Q16SafetyChecker`]): safety_checker ([`Q16SafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful. Classification module that estimates whether generated images could be considered offensive or harmful.
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
feature_extractor ([`CLIPImageProcessor`]): about a model's potential harms.
Model that extracts features from generated images to be used as inputs for the `safety_checker`. feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
""" """
_optional_components = ["safety_checker", "feature_extractor"] _optional_components = ["safety_checker", "feature_extractor"]
...@@ -277,97 +233,130 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline): ...@@ -277,97 +233,130 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
sem_guidance: Optional[List[torch.Tensor]] = None, sem_guidance: Optional[List[torch.Tensor]] = None,
): ):
r""" r"""
Function invoked when calling the pipeline for generation. The call function to the pipeline for generation.
Args: Args:
prompt (`str` or `List[str]`): prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation. The prompt or prompts to guide image generation.
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The height in pixels of the generated image. The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image. The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50): num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 7.5): guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). A higher guidance scale value encourages the model to generate images closely linked to the text
`guidance_scale` is defined as `w` of equation 2. of [Imagen `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
negative_prompt (`str` or `List[str]`, *optional*): negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored The prompt or prompts to guide what to not include in image generation. If not defined, you need to
if `guidance_scale` is less than `1`). pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
num_images_per_prompt (`int`, *optional*, defaults to 1): num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt. The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0): eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
[`schedulers.DDIMScheduler`], will be ignored for others. to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator`, *optional*): generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
to make generation deterministic. generation deterministic.
latents (`torch.FloatTensor`, *optional*): latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`. tensor is generated by sampling using the supplied random `generator`.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between The output format of the generated image. Choose between `PIL.Image` or `np.array`.
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple. plain tuple.
callback (`Callable`, *optional*): callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be A function that is called every `callback_steps` steps during inference. The function is called with the
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1): callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be The frequency at which the `callback` function is called. If not specified, the callback is called at
called at every step. every step.
editing_prompt (`str` or `List[str]`, *optional*): editing_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to use for Semantic guidance. Semantic guidance is disabled by setting The prompt or prompts to use for semantic guidance. Semantic guidance is disabled by setting
`editing_prompt = None`. Guidance direction of prompt should be specified via `editing_prompt = None`. Guidance direction of prompt should be specified via
`reverse_editing_direction`. `reverse_editing_direction`.
editing_prompt_embeddings (`torch.Tensor>`, *optional*): editing_prompt_embeddings (`torch.Tensor`, *optional*):
Pre-computed embeddings to use for semantic guidance. Guidance direction of embedding should be Pre-computed embeddings to use for semantic guidance. Guidance direction of embedding should be
specified via `reverse_editing_direction`. specified via `reverse_editing_direction`.
reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
Whether the corresponding prompt in `editing_prompt` should be increased or decreased. Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
Guidance scale for semantic guidance. If provided as list values should correspond to `editing_prompt`. Guidance scale for semantic guidance. If provided as a list, values should correspond to
`edit_guidance_scale` is defined as `s_e` of equation 6 of [SEGA `editing_prompt`.
Paper](https://arxiv.org/pdf/2301.12247.pdf).
edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10): edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
Number of diffusion steps (for each prompt) for which semantic guidance will not be applied. Momentum Number of diffusion steps (for each prompt) for which semantic guidance is not applied. Momentum is
will still be calculated for those steps and applied once all warmup periods are over. calculated for those steps and applied once all warmup periods are over.
`edit_warmup_steps` is defined as `delta` (δ) of [SEGA Paper](https://arxiv.org/pdf/2301.12247.pdf).
edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`): edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
Number of diffusion steps (for each prompt) after which semantic guidance will no longer be applied. Number of diffusion steps (for each prompt) after which semantic guidance is no longer applied.
edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
Threshold of semantic guidance. Threshold of semantic guidance.
edit_momentum_scale (`float`, *optional*, defaults to 0.1): edit_momentum_scale (`float`, *optional*, defaults to 0.1):
Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0 Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0,
momentum will be disabled. Momentum is already built up during warmup, i.e. for diffusion steps smaller momentum is disabled. Momentum is already built up during warmup (for diffusion steps smaller than
than `sld_warmup_steps`. Momentum will only be added to latent guidance once all warmup periods are `sld_warmup_steps`). Momentum is only added to latent guidance once all warmup periods are finished.
finished. `edit_momentum_scale` is defined as `s_m` of equation 7 of [SEGA
Paper](https://arxiv.org/pdf/2301.12247.pdf).
edit_mom_beta (`float`, *optional*, defaults to 0.4): edit_mom_beta (`float`, *optional*, defaults to 0.4):
Defines how semantic guidance momentum builds up. `edit_mom_beta` indicates how much of the previous Defines how semantic guidance momentum builds up. `edit_mom_beta` indicates how much of the previous
momentum will be kept. Momentum is already built up during warmup, i.e. for diffusion steps smaller momentum is kept. Momentum is already built up during warmup (for diffusion steps smaller than
than `edit_warmup_steps`. `edit_mom_beta` is defined as `beta_m` (β) of equation 8 of [SEGA `edit_warmup_steps`).
Paper](https://arxiv.org/pdf/2301.12247.pdf).
edit_weights (`List[float]`, *optional*, defaults to `None`): edit_weights (`List[float]`, *optional*, defaults to `None`):
Indicates how much each individual concept should influence the overall guidance. If no weights are Indicates how much each individual concept should influence the overall guidance. If no weights are
provided all concepts are applied equally. `edit_mom_beta` is defined as `g_i` of equation 9 of [SEGA provided all concepts are applied equally.
Paper](https://arxiv.org/pdf/2301.12247.pdf).
sem_guidance (`List[torch.Tensor]`, *optional*): sem_guidance (`List[torch.Tensor]`, *optional*):
List of pre-generated guidance vectors to be applied at generation. Length of the list has to List of pre-generated guidance vectors to be applied at generation. Length of the list has to
correspond to `num_inference_steps`. correspond to `num_inference_steps`.
Examples:
```py
>>> import torch
>>> from diffusers import SemanticStableDiffusionPipeline
>>> pipe = SemanticStableDiffusionPipeline.from_pretrained(
... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
... )
>>> pipe = pipe.to("cuda")
>>> out = pipe(
... prompt="a photo of the face of a woman",
... num_images_per_prompt=1,
... guidance_scale=7,
... editing_prompt=[
... "smiling, smile", # Concepts to apply
... "glasses, wearing glasses",
... "curls, wavy hair, curly hair",
... "beard, full beard, mustache",
... ],
... reverse_editing_direction=[
... False,
... False,
... False,
... False,
... ], # Direction of guidance i.e. increase all concepts
... edit_warmup_steps=[10, 10, 10, 10], # Warmup period for each concept
... edit_guidance_scale=[4, 5, 5, 5.4], # Guidance scale for each concept
... edit_threshold=[
... 0.99,
... 0.975,
... 0.925,
... 0.96,
... ], # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
... edit_momentum_scale=0.3, # Momentum scale that will be added to the latent guidance
... edit_mom_beta=0.6, # Momentum beta
... edit_weights=[1, 1, 1, 1, 1], # Weights of the individual concepts against each other
... )
>>> image = out.images[0]
```
Returns: Returns:
[`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] or `tuple`: [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] or `tuple`:
[`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] if `return_dict` is True, If `return_dict` is `True`,
otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] is returned, otherwise a
second element is a list of `bool`s denoting whether the corresponding generated image likely represents `tuple` is returned where the first element is a list with the generated images and the second element
"not-safe-for-work" (nsfw) content, according to the `safety_checker`. is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work"
(nsfw) content.
""" """
# 0. Default height and width to unet # 0. Default height and width to unet
height = height or self.unet.config.sample_size * self.vae_scale_factor height = height or self.unet.config.sample_size * self.vae_scale_factor
......
...@@ -68,11 +68,11 @@ EXAMPLE_DOC_STRING = """ ...@@ -68,11 +68,11 @@ EXAMPLE_DOC_STRING = """
@dataclass @dataclass
class ShapEPipelineOutput(BaseOutput): class ShapEPipelineOutput(BaseOutput):
""" """
Output class for ShapEPipeline. Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
Args: Args:
images (`torch.FloatTensor`) images (`torch.FloatTensor`)
a list of images for 3D rendering A list of images for 3D rendering.
""" """
images: Union[List[List[PIL.Image.Image]], List[List[np.ndarray]]] images: Union[List[List[PIL.Image.Image]], List[List[np.ndarray]]]
...@@ -80,10 +80,10 @@ class ShapEPipelineOutput(BaseOutput): ...@@ -80,10 +80,10 @@ class ShapEPipelineOutput(BaseOutput):
class ShapEPipeline(DiffusionPipeline): class ShapEPipeline(DiffusionPipeline):
""" """
Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E Pipeline for generating the latent representation of a 3D asset and rendering it with the NeRF method.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args: Args:
prior ([`PriorTransformer`]): prior ([`PriorTransformer`]):
...@@ -91,13 +91,12 @@ class ShapEPipeline(DiffusionPipeline): ...@@ -91,13 +91,12 @@ class ShapEPipeline(DiffusionPipeline):
text_encoder ([`CLIPTextModelWithProjection`]): text_encoder ([`CLIPTextModelWithProjection`]):
Frozen text-encoder. Frozen text-encoder.
tokenizer (`CLIPTokenizer`): tokenizer (`CLIPTokenizer`):
Tokenizer of class A [`~transformers.CLIPTokenizer`] to tokenize text.
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
scheduler ([`HeunDiscreteScheduler`]): scheduler ([`HeunDiscreteScheduler`]):
A scheduler to be used in combination with `prior` to generate image embedding. A scheduler to be used in combination with `prior` to generate image embedding.
shap_e_renderer ([`ShapERenderer`]): shap_e_renderer ([`ShapERenderer`]):
Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects
with the NeRF rendering method with the NeRF rendering method.
""" """
def __init__( def __init__(
...@@ -132,10 +131,10 @@ class ShapEPipeline(DiffusionPipeline): ...@@ -132,10 +131,10 @@ class ShapEPipeline(DiffusionPipeline):
def enable_model_cpu_offload(self, gpu_id=0): def enable_model_cpu_offload(self, gpu_id=0):
r""" r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. iterative execution of the `unet`.
""" """
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook from accelerate import cpu_offload_with_hook
...@@ -222,7 +221,7 @@ class ShapEPipeline(DiffusionPipeline): ...@@ -222,7 +221,7 @@ class ShapEPipeline(DiffusionPipeline):
return_dict: bool = True, return_dict: bool = True,
): ):
""" """
Function invoked when calling the pipeline for generation. The call function to the pipeline for generation.
Args: Args:
prompt (`str` or `List[str]`): prompt (`str` or `List[str]`):
...@@ -233,30 +232,31 @@ class ShapEPipeline(DiffusionPipeline): ...@@ -233,30 +232,31 @@ class ShapEPipeline(DiffusionPipeline):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. expense of slower inference.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
to make generation deterministic. generation deterministic.
latents (`torch.FloatTensor`, *optional*): latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`. tensor is generated by sampling using the supplied random `generator`.
guidance_scale (`float`, *optional*, defaults to 4.0): guidance_scale (`float`, *optional*, defaults to 4.0):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). A higher guidance scale value encourages the model to generate images closely linked to the text
`guidance_scale` is defined as `w` of equation 2. of [Imagen `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
frame_size (`int`, *optional*, defaults to 64): frame_size (`int`, *optional*, defaults to 64):
the width and height of each image frame of the generated 3d output The width and height of each image frame of the generated 3D output.
output_type (`str`, *optional*, defaults to `"pt"`): output_type (`str`, *optional*, defaults to `"pt"`):
The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
(`np.array`), `"latent"` (`torch.Tensor`), or mesh ([`MeshDecoderOutput`]). (`np.array`), `"latent"` (`torch.Tensor`), or mesh ([`MeshDecoderOutput`]).
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
tuple.
Examples: Examples:
Returns: Returns:
[`ShapEPipelineOutput`] or `tuple` [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
otherwise a `tuple` is returned where the first element is a list with the generated images.
""" """
if isinstance(prompt, str): if isinstance(prompt, str):
......
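A minimal text-to-3D sketch for the call documented above (the checkpoint name and prompt are assumptions):

```py
import torch

from diffusers import ShapEPipeline
from diffusers.utils import export_to_gif

# Assumed checkpoint.
pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# `images` contains one list of rendered frames per prompt.
images = pipe(
    "a shark",
    guidance_scale=15.0,
    num_inference_steps=64,
    frame_size=256,
    output_type="pil",
).images
export_to_gif(images[0], "shark_3d.gif")
```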
...@@ -67,11 +67,11 @@ EXAMPLE_DOC_STRING = """ ...@@ -67,11 +67,11 @@ EXAMPLE_DOC_STRING = """
@dataclass @dataclass
class ShapEPipelineOutput(BaseOutput): class ShapEPipelineOutput(BaseOutput):
""" """
Output class for ShapEPipeline. Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
Args: Args:
images (`torch.FloatTensor`) images (`torch.FloatTensor`)
a list of images for 3D rendering A list of images for 3D rendering.
""" """
images: Union[PIL.Image.Image, np.ndarray] images: Union[PIL.Image.Image, np.ndarray]
...@@ -79,24 +79,24 @@ class ShapEPipelineOutput(BaseOutput): ...@@ -79,24 +79,24 @@ class ShapEPipelineOutput(BaseOutput):
class ShapEImg2ImgPipeline(DiffusionPipeline): class ShapEImg2ImgPipeline(DiffusionPipeline):
""" """
Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E Pipeline for generating the latent representation of a 3D asset from an image and rendering it with the
NeRF method.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args: Args:
prior ([`PriorTransformer`]): prior ([`PriorTransformer`]):
The canonical unCLIP prior to approximate the image embedding from the text embedding. The canonical unCLIP prior to approximate the image embedding from the text embedding.
text_encoder ([`CLIPTextModelWithProjection`]): image_encoder ([`CLIPVisionModel`]):
Frozen text-encoder. Frozen image-encoder.
tokenizer (`CLIPTokenizer`): image_processor (`CLIPImageProcessor`):
Tokenizer of class A [`~transformers.CLIPImageProcessor`] to process images.
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
scheduler ([`HeunDiscreteScheduler`]): scheduler ([`HeunDiscreteScheduler`]):
A scheduler to be used in combination with `prior` to generate image embedding. A scheduler to be used in combination with `prior` to generate image embedding.
shap_e_renderer ([`ShapERenderer`]): shap_e_renderer ([`ShapERenderer`]):
Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects
with the NeRF rendering method with the NeRF rendering method.
""" """
def __init__( def __init__(
...@@ -174,40 +174,41 @@ class ShapEImg2ImgPipeline(DiffusionPipeline): ...@@ -174,40 +174,41 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
return_dict: bool = True, return_dict: bool = True,
): ):
""" """
Function invoked when calling the pipeline for generation. The call function to the pipeline for generation.
Args: Args:
prompt (`str` or `List[str]`): image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
The prompt or prompts to guide the image generation. `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
latents as `image`; if passing latents directly, they are not encoded again.
num_images_per_prompt (`int`, *optional*, defaults to 1): num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt. The number of images to generate per prompt.
num_inference_steps (`int`, *optional*, defaults to 100): num_inference_steps (`int`, *optional*, defaults to 100):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. expense of slower inference.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
to make generation deterministic. generation deterministic.
latents (`torch.FloatTensor`, *optional*): latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`. tensor is generated by sampling using the supplied random `generator`.
guidance_scale (`float`, *optional*, defaults to 4.0): guidance_scale (`float`, *optional*, defaults to 4.0):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). A higher guidance scale value encourages the model to generate images closely linked to the text
`guidance_scale` is defined as `w` of equation 2. of [Imagen `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
frame_size (`int`, *optional*, defaults to 64): frame_size (`int`, *optional*, defaults to 64):
the width and height of each image frame of the generated 3d output The width and height of each image frame of the generated 3D output.
output_type (`str`, *optional*, defaults to `"pt"`): output_type (`str`, *optional*, defaults to `"pt"`):
(`np.array`),`"latent"` (`torch.Tensor`), mesh ([`MeshDecoderOutput`]). (`np.array`),`"latent"` (`torch.Tensor`), mesh ([`MeshDecoderOutput`]).
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
tuple.
Examples: Examples:
Returns: Returns:
[`ShapEPipelineOutput`] or `tuple` [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
otherwise a `tuple` is returned where the first element is a list with the generated images.
""" """
if isinstance(image, PIL.Image.Image): if isinstance(image, PIL.Image.Image):
......
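A minimal image-to-3D sketch for the call documented above (the checkpoint name and image URL are assumptions):

```py
import torch

from diffusers import ShapEImg2ImgPipeline
from diffusers.utils import export_to_gif, load_image

# Assumed checkpoint and input image.
pipe = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

image = load_image("https://hf.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png")

# `images` contains one list of rendered frames per input image.
images = pipe(
    image,
    guidance_scale=3.0,
    num_inference_steps=64,
    frame_size=256,
    output_type="pil",
).images
export_to_gif(images[0], "corgi_3d.gif")
```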
...@@ -38,6 +38,21 @@ TARGET_FEATURE_LENGTH = 256 ...@@ -38,6 +38,21 @@ TARGET_FEATURE_LENGTH = 256
class SpectrogramDiffusionPipeline(DiffusionPipeline): class SpectrogramDiffusionPipeline(DiffusionPipeline):
r"""
Pipeline for audio generation from MIDI note tokens using spectrogram diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
notes_encoder ([`SpectrogramNotesEncoder`]):
A [`SpectrogramNotesEncoder`] to encode the MIDI note tokens.
continuous_encoder ([`SpectrogramContEncoder`]):
A [`SpectrogramContEncoder`] to encode the spectrogram of the previously generated audio segment.
decoder ([`T5FilmDecoder`]):
A [`T5FilmDecoder`] to denoise the encoded audio latents.
scheduler ([`DDPMScheduler`]):
A scheduler to be used in combination with `decoder` to denoise the encoded audio latents.
melgan ([`OnnxRuntimeModel`]):
A MelGAN vocoder (exported as an [`OnnxRuntimeModel`]) to convert the generated mel spectrograms into an audio waveform.
"""
_optional_components = ["melgan"] _optional_components = ["melgan"]
def __init__( def __init__(
...@@ -127,6 +142,48 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline): ...@@ -127,6 +142,48 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):
f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
f" {type(callback_steps)}." f" {type(callback_steps)}."
) )
r"""
The call function to the pipeline for generation.
Args:
input_tokens (`List[List[int]]`):
Sequences of MIDI note tokens to condition the generated audio on (for example, as produced by [`MidiProcessor`]).
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 100):
The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
expense of slower inference.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
output_type (`str`, *optional*, defaults to `"numpy"`):
The output format of the generated audio.
callback (`Callable`, *optional*):
A function that is called every `callback_steps` steps during inference. The function is called with the
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function is called. If not specified, the callback is called at
every step.
Example:
```py
>>> from diffusers import SpectrogramDiffusionPipeline, MidiProcessor
>>> pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
>>> pipe = pipe.to("cuda")
>>> processor = MidiProcessor()
>>> # Download MIDI from: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid
>>> output = pipe(processor("beethoven_hammerklavier_2.mid"))
>>> audio = output.audios[0]
```
Returns:
[`~pipelines.AudioPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated audio.
"""
pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32)
full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32)
......
...@@ -25,11 +25,11 @@ class StableDiffusionPipelineOutput(BaseOutput): ...@@ -25,11 +25,11 @@ class StableDiffusionPipelineOutput(BaseOutput):
Args: Args:
images (`List[PIL.Image.Image]` or `np.ndarray`) images (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. num_channels)`.
nsfw_content_detected (`List[bool]`) nsfw_content_detected (`List[bool]`)
List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
(nsfw) content, or `None` if safety checking could not be performed. `None` if safety checking could not be performed.
""" """
images: Union[List[PIL.Image.Image], np.ndarray] images: Union[List[PIL.Image.Image], np.ndarray]
...@@ -116,14 +116,14 @@ if is_transformers_available() and is_flax_available(): ...@@ -116,14 +116,14 @@ if is_transformers_available() and is_flax_available():
@flax.struct.dataclass @flax.struct.dataclass
class FlaxStableDiffusionPipelineOutput(BaseOutput): class FlaxStableDiffusionPipelineOutput(BaseOutput):
""" """
Output class for Stable Diffusion pipelines. Output class for Flax-based Stable Diffusion pipelines.
Args: Args:
images (`np.ndarray`) images (`np.ndarray`):
Array of shape `(batch_size, height, width, num_channels)` with images from the diffusion pipeline. Denoised images of array shape `(batch_size, height, width, num_channels)`.
nsfw_content_detected (`List[bool]`) nsfw_content_detected (`List[bool]`):
List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content
(nsfw) content. or `None` if safety checking could not be performed.
""" """
images: np.ndarray images: np.ndarray
......
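Because the Flax output holds plain NumPy arrays rather than PIL images, a short conversion sketch may be useful; `to_pil` is a hypothetical helper and the code assumes the array values are in the `[0, 1]` range with the shape described above:

```py
import numpy as np
from PIL import Image


def to_pil(images: np.ndarray) -> list:
    # `images` has shape (batch_size, height, width, num_channels) with float values in [0, 1].
    images = (images * 255).round().astype("uint8")
    return [Image.fromarray(image) for image in images]


# e.g. pil_images = to_pil(np.asarray(output.images))
```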
...@@ -130,28 +130,27 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -130,28 +130,27 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
r""" r"""
Pipeline for text-guided image-to-image generation using Stable Diffusion. Pipeline for text-guided image-to-image generation using Stable Diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args: Args:
vae ([`AutoencoderKL`]): vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
text_encoder ([`CLIPTextModel`]): text_encoder ([`~transformers.CLIPTextModel`]):
Frozen text-encoder. Stable Diffusion uses the text portion of Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically tokenizer ([`~transformers.CLIPTokenizer`]):
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. A `CLIPTokenizer` to tokenize text.
tokenizer (`CLIPTokenizer`): unet ([`UNet2DConditionModel`]):
Tokenizer of class A `UNet2DConditionModel` to denoise the encoded image latents.
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]): scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can only be an
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. instance of [`DDIMScheduler`].
safety_checker ([`StableDiffusionSafetyChecker`]): safety_checker ([`StableDiffusionSafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful. Classification module that estimates whether generated images could be considered offensive or harmful.
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
feature_extractor ([`CLIPImageProcessor`]): about a model's potential harms.
Model that extracts features from generated images to be used as inputs for the `safety_checker`. feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
""" """
_optional_components = ["safety_checker", "feature_extractor"] _optional_components = ["safety_checker", "feature_extractor"]
...@@ -234,10 +233,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -234,10 +233,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0): def enable_model_cpu_offload(self, gpu_id=0):
r""" r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. iterative execution of the `unet`.
""" """
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook from accelerate import cpu_offload_with_hook
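A short usage sketch for the trade-off described above; the model id is only a placeholder, and `enable_sequential_cpu_offload` is the slower, more memory-frugal alternative mentioned in the docstring:

```py
import torch
from diffusers import CycleDiffusionPipeline, DDIMScheduler

model_id = "CompVis/stable-diffusion-v1-4"
scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16)

# Moves whole sub-models (text encoder, unet, vae, ...) to the GPU one at a time as they are needed.
pipe.enable_model_cpu_offload()

# For the lowest memory footprint at a larger speed cost, use instead:
# pipe.enable_sequential_cpu_offload()
```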
...@@ -595,71 +594,134 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -595,71 +594,134 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
): ):
r""" r"""
Function invoked when calling the pipeline for generation. The call function to the pipeline for generation.
Args: Args:
prompt (`str` or `List[str]`): prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation. The prompt or prompts to guide the image generation.
image (`torch.FloatTensor`, `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): image (`torch.FloatTensor`, `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded latents as `image`, but if passing latents directly it is not encoded again.
again.
strength (`float`, *optional*, defaults to 0.8): strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` Indicates the extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
will be used as a starting point, adding more noise to it the larger the `strength`. The number of starting point and more noise is added the higher the `strength`. The number of denoising steps depends
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
be maximum and the denoising process will run for the full number of iterations specified in process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`. essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50): num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`. expense of slower inference. This parameter is modulated by `strength`.
guidance_scale (`float`, *optional*, defaults to 7.5): guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). A higher guidance scale value encourages the model to generate images closely linked to the text
`guidance_scale` is defined as `w` of equation 2. of [Imagen `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
source_guidance_scale (`float`, *optional*, defaults to 1): source_guidance_scale (`float`, *optional*, defaults to 1):
Guidance scale for the source prompt. This is useful to control the amount of influence the source Guidance scale for the source prompt. This is useful to control the amount of influence the source
prompt for encoding. prompt has when encoding the image.
num_images_per_prompt (`int`, *optional*, defaults to 1): num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt. The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.1): eta (`float`, *optional*, defaults to 0.1):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
[`schedulers.DDIMScheduler`], will be ignored for others. to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator`, *optional*): generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
to make generation deterministic. generation deterministic.
prompt_embeds (`torch.FloatTensor`, *optional*): prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings will be generated from `prompt` input argument. provided, text embeddings are generated from the `prompt` input argument.
negative_prompt_embeds (`torch.FloatTensor`, *optional*): negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
argument.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between The output format of the generated image. Choose between `PIL.Image` or `np.array`.
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple. plain tuple.
callback (`Callable`, *optional*): callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be A function that is called every `callback_steps` steps during inference. The function is called with the
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1): callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be The frequency at which the `callback` function is called. If not specified, the callback is called at
called at every step. every step.
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
`self.processor` in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
[diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
Example:
```py
import requests
import torch
from PIL import Image
from io import BytesIO
from diffusers import CycleDiffusionPipeline, DDIMScheduler
# load the pipeline
# make sure you're logged in with `huggingface-cli login`
model_id_or_path = "CompVis/stable-diffusion-v1-4"
scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
# let's download an initial image
url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((512, 512))
init_image.save("horse.png")
# let's specify a prompt
source_prompt = "An astronaut riding a horse"
prompt = "An astronaut riding an elephant"
# call the pipeline
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.8,
guidance_scale=2,
source_guidance_scale=1,
).images[0]
image.save("horse_to_elephant.png")
# let's try another example
# See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
url = (
"https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
)
response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((512, 512))
init_image.save("black.png")
source_prompt = "A black colored car"
prompt = "A blue colored car"
# call the pipeline
torch.manual_seed(0)
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.85,
guidance_scale=3,
source_guidance_scale=1,
).images[0]
image.save("black_to_blue.png")
```
Returns: Returns:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
When returning a tuple, the first element is a list with the generated images, and the second element is a otherwise a `tuple` is returned where the first element is a list with the generated images and the
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" second element is a list of `bool`s indicating whether the corresponding generated image contains
(nsfw) content, according to the `safety_checker`. "not-safe-for-work" (nsfw) content.
""" """
# 1. Check inputs # 1. Check inputs
self.check_inputs(prompt, strength, callback_steps) self.check_inputs(prompt, strength, callback_steps)
......
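The interaction between `strength`, `num_inference_steps`, and `guidance_scale` described in the docstring above can be made concrete with a rough sketch; the step truncation and the classifier-free guidance combination are written out explicitly here, the random tensors only stand in for the UNet's noise predictions, and the pipeline's internal rounding and batching may differ:

```py
import torch

num_inference_steps = 100
strength = 0.8
guidance_scale = 2.0

# `strength` truncates the schedule: only roughly the last `strength` fraction of the
# denoising steps is actually run, so part of the reference image is preserved.
effective_steps = int(num_inference_steps * strength)  # ~80 steps here

# Classifier-free guidance blends the unconditional and text-conditioned noise predictions.
noise_uncond = torch.randn(1, 4, 64, 64)  # stand-in for the unconditional UNet output
noise_text = torch.randn(1, 4, 64, 64)    # stand-in for the text-conditioned UNet output
noise_pred = noise_uncond + guidance_scale * (noise_text - noise_uncond)
```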