Unverified Commit a69754bb authored by Steven Liu's avatar Steven Liu Committed by GitHub
Browse files

[docs] Clean up pipeline apis (#3905)

* start with stable diffusion

* fix

* finish stable diffusion pipelines

* fix path to pipeline output

* fix flax paths

* fix copies

* add up to score sde ve

* finish first pass of pipelines

* fix copies

* second review

* align doc titles

* more review fixes

* final review
parent bcc570b9
......@@ -26,13 +26,16 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
class DanceDiffusionPipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for audio generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded audio.
unet ([`UNet1DModel`]):
A `UNet1DModel` to denoise the encoded audio.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded audio. Can be one of
A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
[`IPNDMScheduler`].
"""
......@@ -50,6 +53,8 @@ class DanceDiffusionPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[AudioPipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of audio samples to generate.
......@@ -57,17 +62,40 @@ class DanceDiffusionPipeline(DiffusionPipeline):
The number of denoising steps. More denoising steps usually lead to a higher-quality audio sample at
the expense of slower inference.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`):
The length of the generated audio sample in seconds. Note that the output of the pipeline, *i.e.*
`sample_size`, will be `audio_length_in_s` * `self.unet.config.sample_rate`.
The length of the generated audio sample in seconds.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
Example:
```py
from diffusers import DiffusionPipeline
from scipy.io.wavfile import write
model_id = "harmonai/maestro-150k"
pipe = DiffusionPipeline.from_pretrained(model_id)
pipe = pipe.to("cuda")
audios = pipe(audio_length_in_s=4.0).audios
# To save locally
for i, audio in enumerate(audios):
write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose())
# To dislay in google colab
import IPython.display as ipd
for audio in audios:
display(ipd.Audio(audio, rate=pipe.unet.sample_rate))
```
Returns:
[`~pipelines.AudioPipelineOutput`] or `tuple`: [`~pipelines.utils.AudioPipelineOutput`] if `return_dict` is
True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated audio.
[`~pipelines.AudioPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated audio.
"""
if audio_length_in_s is None:
......
......@@ -23,11 +23,14 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DDIMPipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
......@@ -53,29 +56,56 @@ class DDIMPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of images to generate.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
eta (`float`, *optional*, defaults to 0.0):
The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. A value of `0` corresponds to
DDIM and `1` corresponds to DDPM.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
use_clipped_model_output (`bool`, *optional*, defaults to `None`):
if `True` or `False`, see documentation for `DDIMScheduler.step`. If `None`, nothing is passed
downstream to the scheduler. So use `None` for schedulers which don't support this argument.
If `True` or `False`, see documentation for [`DDIMScheduler.step`]. If `None`, nothing is passed
downstream to the scheduler (use `None` for schedulers which don't support this argument).
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DDIMPipeline
>>> import PIL.Image
>>> import numpy as np
>>> # load model and scheduler
>>> pipe = DDIMPipeline.from_pretrained("fusing/ddim-lsun-bedroom")
>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pipe(eta=0.0, num_inference_steps=50)
>>> # process image to PIL
>>> image_processed = image.cpu().permute(0, 2, 3, 1)
>>> image_processed = (image_processed + 1.0) * 127.5
>>> image_processed = image_processed.numpy().astype(np.uint8)
>>> image_pil = PIL.Image.fromarray(image_processed[0])
>>> # save image
>>> image_pil.save("test.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images
"""
# Sample gaussian noise to begin loop
......
......@@ -23,11 +23,14 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DDPMPipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
......@@ -47,24 +50,41 @@ class DDPMPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of images to generate.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 1000):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DDPMPipeline
>>> # load model and scheduler
>>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")
>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pipe().images[0]
>>> # save image
>>> image.save("ddpm_generated_image.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images
"""
# Sample gaussian noise to begin loop
if isinstance(self.unet.config.sample_size, int):
......
......@@ -30,16 +30,18 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DiTPipeline(DiffusionPipeline):
r"""
This pipeline inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for image generation based on a Transformer backbone instead of a UNet.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
transformer ([`Transformer2DModel`]):
Class conditioned Transformer in Diffusion model to denoise the encoded image latents.
A class conditioned `Transformer2DModel` to denoise the encoded image latents.
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
scheduler ([`DDIMScheduler`]):
A scheduler to be used in combination with `dit` to denoise the encoded image latents.
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
"""
def __init__(
......@@ -63,13 +65,15 @@ class DiTPipeline(DiffusionPipeline):
def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
r"""
Map label strings, *e.g.* from ImageNet, to corresponding class ids.
Map label strings from ImageNet to corresponding class ids.
Parameters:
label (`str` or `dict` of `str`): label strings to be mapped to class ids.
label (`str` or `dict` of `str`):
Label strings to be mapped to class ids.
Returns:
`list` of `int`: Class ids to be processed by pipeline.
`list` of `int`:
Class ids to be processed by pipeline.
"""
if not isinstance(label, list):
......@@ -94,24 +98,53 @@ class DiTPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
Function invoked when calling the pipeline for generation.
The call function to the pipeline for generation.
Args:
class_labels (List[int]):
List of imagenet class labels for the images to be generated.
List of ImageNet class labels for the images to be generated.
guidance_scale (`float`, *optional*, defaults to 4.0):
Scale of the guidance signal.
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
generator (`torch.Generator`, *optional*):
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 250):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Examples:
```py
>>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler
>>> import torch
>>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
>>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
>>> pipe = pipe.to("cuda")
>>> # pick words from Imagenet class labels
>>> pipe.labels # to print all available words
>>> # pick words that exist in ImageNet
>>> words = ["white shark", "umbrella"]
>>> class_ids = pipe.get_label_ids(words)
>>> generator = torch.manual_seed(33)
>>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)
>>> image = output.images[0] # label 'white shark'
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images
"""
batch_size = len(class_labels)
......
......@@ -31,18 +31,20 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class LDMTextToImagePipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for text-to-image generation using latent diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
vqvae ([`VQModel`]):
Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
Vector-quantized (VQ) model to encode and decode images to and from latent representations.
bert ([`LDMBertModel`]):
Text-encoder model based on [BERT](https://huggingface.co/docs/transformers/model_doc/bert) architecture.
tokenizer (`transformers.BertTokenizer`):
Tokenizer of class
[BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer).
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
Text-encoder model based on [`~transformers.BERT`].
tokenizer ([`~transformers.BertTokenizer`]):
A `BertTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
......@@ -76,38 +78,54 @@ class LDMTextToImagePipeline(DiffusionPipeline):
**kwargs,
) -> Union[Tuple, ImagePipelineOutput]:
r"""
The call function to the pipeline for generation.
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 1.0):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt` at
the, usually at the expense of lower image quality.
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`.
tensor is generated by sampling using the supplied random `generator`.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DiffusionPipeline
>>> # load model and scheduler
>>> ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
>>> # run pipeline in inference (sample random noise and denoise)
>>> prompt = "A painting of a squirrel eating a burger"
>>> images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images
>>> # save images
>>> for idx, image in enumerate(images):
... image.save(f"squirrel-{idx}.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
# 0. Default height and width to unet
height = height or self.unet.config.sample_size * self.vae_scale_factor
......
......@@ -31,15 +31,16 @@ def preprocess(image):
class LDMSuperResolutionPipeline(DiffusionPipeline):
r"""
A pipeline for image super-resolution using Latent
A pipeline for image super-resolution using latent diffusion.
This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
vqvae ([`VQModel`]):
Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations.
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
Vector-quantized (VQ) model to encode and decode images to and from latent representations.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
......@@ -74,30 +75,58 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[Tuple, ImagePipelineOutput]:
r"""
The call function to the pipeline for generation.
Args:
image (`torch.Tensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
`Image` or tensor representing an image batch to be used as the starting point for the process.
batch_size (`int`, *optional*, defaults to 1):
Number of images to generate.
num_inference_steps (`int`, *optional*, defaults to 100):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> import requests
>>> from PIL import Image
>>> from io import BytesIO
>>> from diffusers import LDMSuperResolutionPipeline
>>> import torch
>>> # load model and scheduler
>>> pipeline = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
>>> pipeline = pipeline.to("cuda")
>>> # let's download an image
>>> url = (
... "https://user-images.githubusercontent.com/38061659/199705896-b48e17b8-b231-47cd-a270-4ffa5a93fa3e.png"
... )
>>> response = requests.get(url)
>>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
>>> low_res_img = low_res_img.resize((128, 128))
>>> # run pipeline in inference (sample random noise and denoise)
>>> upscaled_image = pipeline(low_res_img, num_inference_steps=100, eta=1).images[0]
>>> # save image
>>> upscaled_image.save("ldm_generated_image.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images
"""
if isinstance(image, PIL.Image.Image):
batch_size = 1
......
......@@ -25,15 +25,18 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class LDMPipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for unconditional image generation using latent diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
vqvae ([`VQModel`]):
Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latents.
Vector-quantized (VQ) model to encode and decode images to and from latent representations.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
[`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents.
[`DDIMScheduler`] is used in combination with `unet` to denoise the encoded image latents.
"""
def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler):
......@@ -52,24 +55,38 @@ class LDMPipeline(DiffusionPipeline):
**kwargs,
) -> Union[Tuple, ImagePipelineOutput]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
Number of images to generate.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import LDMPipeline
>>> # load model and scheduler
>>> pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pipe().images[0]
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images
"""
latents = randn_tensor(
......
......@@ -136,28 +136,36 @@ def prepare_mask_and_masked_image(image, mask):
class PaintByExamplePipeline(DiffusionPipeline):
r"""
Pipeline for image-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
<Tip warning={true}>
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
🧪 This is an experimental feature!
</Tip>
Pipeline for image-guided image inpainting using Stable Diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
image_encoder ([`PaintByExampleImageEncoder`]):
Encodes the example input image. The unet is conditioned on the example image instead of a text prompt.
tokenizer (`CLIPTokenizer`):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
Encodes the example input image. The `unet` is conditioned on the example image instead of a text prompt.
tokenizer ([`~transformers.CLIPTokenizer`]):
A `CLIPTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
safety_checker ([`StableDiffusionSafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful.
Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
feature_extractor ([`CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
about a model's potential harms.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
# TODO: feature_extractor is required to encode initial images (if they are in PIL format),
# we should give a descriptive message if the pipeline doesn't have one.
......@@ -378,66 +386,99 @@ class PaintByExamplePipeline(DiffusionPipeline):
callback_steps: int = 1,
):
r"""
Function invoked when calling the pipeline for generation.
The call function to the pipeline for generation.
Args:
example_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
The exemplar image to guide the image generation.
An example image to guide image generation.
image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
`Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
be masked out with `mask_image` and repainted according to `prompt`.
`Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with
`mask_image` and repainted according to `prompt`).
mask_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
instead of 3, so the expected shape would be `(B, H, W, 1)`.
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
`Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted,
while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel
(luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the
expected shape would be `(B, H, W, 1)`.
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
if `guidance_scale` is less than `1`).
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`.
tensor is generated by sampling using the supplied random `generator`.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple.
callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
A function that calls every `callback_steps` steps during inference. The function is called with the
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
The frequency at which the `callback` function is called. If not specified, the callback is called at
every step.
Example:
```py
>>> import PIL
>>> import requests
>>> import torch
>>> from io import BytesIO
>>> from diffusers import PaintByExamplePipeline
>>> def download_image(url):
... response = requests.get(url)
... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
>>> img_url = (
... "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png"
... )
>>> mask_url = (
... "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png"
... )
>>> example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg"
>>> init_image = download_image(img_url).resize((512, 512))
>>> mask_image = download_image(mask_url).resize((512, 512))
>>> example_image = download_image(example_url).resize((512, 512))
>>> pipe = PaintByExamplePipeline.from_pretrained(
... "Fantasy-Studio/Paint-by-Example",
... torch_dtype=torch.float16,
... )
>>> pipe = pipe.to("cuda")
>>> image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
>>> image
```
Returns:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
When returning a tuple, the first element is a list with the generated images, and the second element is a
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
otherwise a `tuple` is returned where the first element is a list with the generated images and the
second element is a list of `bool`s indicating whether the corresponding generated image contains
"not-safe-for-work" (nsfw) content.
"""
# 1. Define call parameters
if isinstance(image, PIL.Image.Image):
......
......@@ -92,18 +92,17 @@ class FlaxImagePipelineOutput(BaseOutput):
class FlaxDiffusionPipeline(ConfigMixin):
r"""
Base class for all models.
Base class for Flax-based pipelines.
[`FlaxDiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion
pipelines and handles methods for loading, downloading and saving models as well as a few methods common to all
pipelines to:
[`FlaxDiffusionPipeline`] stores all components (models, schedulers, and processors) for diffusion pipelines and
provides methods for loading, downloading and saving models. It also includes methods to:
- enabling/disabling the progress bar for the denoising iteration
- enable/disable the progress bar for the denoising iteration
Class attributes:
- **config_name** ([`str`]) -- name of the config file that will store the class and module names of all
components of the diffusion pipeline.
- **config_name** ([`str`]) -- The configuration filename that stores the class and module names of all the
diffusion pipeline's components.
"""
config_name = "model_index.json"
......@@ -143,10 +142,9 @@ class FlaxDiffusionPipeline(ConfigMixin):
def save_pretrained(self, save_directory: Union[str, os.PathLike], params: Union[Dict, FrozenDict]):
# TODO: handle inference_state
"""
Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to
a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading
method. The pipeline can easily be re-loaded using the `[`~FlaxDiffusionPipeline.from_pretrained`]` class
method.
Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its
class implements both a save and loading method. The pipeline is easily reloaded using the
[`~FlaxDiffusionPipeline.from_pretrained`] class method.
Arguments:
save_directory (`str` or `os.PathLike`):
......@@ -193,70 +191,61 @@ class FlaxDiffusionPipeline(ConfigMixin):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
r"""
Instantiate a Flax diffusion pipeline from pre-trained pipeline weights.
Instantiate a Flax-based diffusion pipeline from pretrained pipeline weights.
The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
The pipeline is set in evaluation mode (`model.eval()) by default and dropout modules are deactivated.
The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.
If you get the error message below, you need to finetune the weights for your downstream task:
The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
weights are discarded.
```
Some weights of FlaxUNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
```
Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *repo id* of a pretrained pipeline hosted inside a model repo on
https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like
`CompVis/ldm-text2im-large-256`.
- A path to a *directory* containing pipeline weights saved using
[`~FlaxDiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`.
- A string, the *repo id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained pipeline
hosted on the Hub.
- A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
using [`~FlaxDiffusionPipeline.save_pretrained`].
dtype (`str` or `jnp.dtype`, *optional*):
Override the default `jnp.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
will be automatically derived from the model's weights.
Override the default `jnp.dtype` and load the model under this dtype. If `"auto"`, the dtype is
automatically derived from the model's weights.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
incompletely downloaded files are deleted.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(`bool`, *optional*, defaults to `False`):
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model).
local_files_only (`bool`, *optional*, defaults to `False`):
Whether to only load local model weights and configuration files or not. If set to `True`, the model
won't be downloaded from the Hub.
use_auth_token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`).
The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
`diffusers-cli login` (stored in `~/.huggingface`) is used.
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
allowed by Git.
mirror (`str`, *optional*):
Mirror source to accelerate downloads in China. If you are from China and have an accessibility
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
Please refer to the mirror site for more information. specify the folder name here.
Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
information.
kwargs (remaining dictionary of keyword arguments, *optional*):
Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the
specific pipeline class. The overwritten components are then directly passed to the pipelines
`__init__` method. See example below for more information.
<Tip>
It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`
</Tip>
Can be used to overwrite load and saveable variables (the pipeline components) of the specific pipeline
class. The overwritten components are passed directly to the pipelines `__init__` method.
<Tip>
Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
this method in a firewalled environment.
To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
`huggingface-cli login`. You can also activate the special
[“offline-mode”](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
firewalled environment.
</Tip>
......@@ -540,7 +529,7 @@ class FlaxDiffusionPipeline(ConfigMixin):
@staticmethod
def numpy_to_pil(images):
"""
Convert a numpy image or a batch of images to a PIL image.
Convert a NumPy image or a batch of images to a PIL image.
"""
if images.ndim == 3:
images = images[None, ...]
......
......@@ -463,13 +463,13 @@ class DiffusionPipeline(ConfigMixin):
provides methods for loading, downloading and saving models. It also includes methods to:
- move all PyTorch modules to the device of your choice
- enabling/disabling the progress bar for the denoising iteration
- enable/disable the progress bar for the denoising iteration
Class attributes:
- **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
diffusion pipeline's components.
- **_optional_components** (List[`str`]) -- List of all optional components that don't have to be passed to the
- **_optional_components** (`List[str]`) -- List of all optional components that don't have to be passed to the
pipeline to function (should be overridden by subclasses).
"""
config_name = "model_index.json"
......@@ -1475,10 +1475,9 @@ class DiffusionPipeline(ConfigMixin):
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
r"""
Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
When this option is enabled, you should observe lower GPU memory usage and a potential speed up during
inference. Speed up during training is not guaranteed.
Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). When this
option is enabled, you should observe lower GPU memory usage and a potential speed up during inference. Speed
up during training is not guaranteed.
<Tip warning={true}>
......@@ -1537,10 +1536,9 @@ class DiffusionPipeline(ConfigMixin):
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
Enable sliced attention computation.
When this option is enabled, the attention module splits the input tensor in slices to compute attention in
several steps. This is useful to save some memory in exchange for a small speed decrease.
Enable sliced attention computation. When this option is enabled, the attention module splits the input tensor
in slices to compute attention in several steps. This is useful to save some memory in exchange for a small
speed decrease.
Args:
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
......
......@@ -25,13 +25,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class PNDMPipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for unconditional image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
The `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`PNDMScheduler`]):
A `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image.
"""
unet: UNet2DModel
......@@ -55,22 +58,41 @@ class PNDMPipeline(DiffusionPipeline):
**kwargs,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, `optional`, defaults to 1): The number of images to generate.
batch_size (`int`, `optional`, defaults to 1):
The number of images to generate.
num_inference_steps (`int`, `optional`, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
generator (`torch.Generator`, `optional`): A [torch
generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
deterministic.
output_type (`str`, `optional`, defaults to `"pil"`): The output format of the generate image. Choose
between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, `optional`, defaults to `True`): Whether or not to return a
[`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
generator (`torch.Generator`, `optional`):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
output_type (`str`, `optional`, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import PNDMPipeline
>>> # load model and scheduler
>>> pndm = PNDMPipeline.from_pretrained("google/ddpm-cifar10-32")
>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pndm().images[0]
>>> # save image
>>> image.save("pndm_generated_image.png")
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
# For more information on the sampling method you can take a look at Algorithm 2 of
# the official paper: https://arxiv.org/pdf/2202.09778.pdf
......
......@@ -77,6 +77,19 @@ def _preprocess_mask(mask: Union[List, PIL.Image.Image, torch.Tensor]):
class RePaintPipeline(DiffusionPipeline):
r"""
Pipeline for image inpainting using RePaint.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`RePaintScheduler`]):
A `RePaintScheduler` to be used in combination with `unet` to denoise the encoded image.
"""
unet: UNet2DModel
scheduler: RePaintScheduler
......@@ -98,35 +111,77 @@ class RePaintPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
image (`torch.FloatTensor` or `PIL.Image.Image`):
The original image to inpaint on.
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
The mask_image where 0.0 values define which part of the original image to inpaint (change).
The mask_image where 0.0 define which part of the original image to inpaint.
num_inference_steps (`int`, *optional*, defaults to 1000):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
eta (`float`):
The weight of noise for added noise in a diffusion step. Its value is between 0.0 and 1.0 - 0.0 is DDIM
and 1.0 is DDPM scheduler respectively.
The weight of the added noise in a diffusion step. Its value is between 0.0 and 1.0; 0.0 corresponds to
DDIM and 1.0 is the DDPM scheduler.
jump_length (`int`, *optional*, defaults to 10):
The number of steps taken forward in time before going backward in time for a single jump ("j" in
RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
RePaint paper). Take a look at Figure 9 and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
jump_n_sample (`int`, *optional*, defaults to 10):
The number of times we will make forward time jump for a given chosen time sample. Take a look at
Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
The number of times to make a forward time jump for a given chosen time sample. Take a look at Figure 9
and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
output_type (`str`, `optional`, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from io import BytesIO
>>> import torch
>>> import PIL
>>> import requests
>>> from diffusers import RePaintPipeline, RePaintScheduler
>>> def download_image(url):
... response = requests.get(url)
... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
>>> img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png"
>>> mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
>>> # Load the original image and the mask as PIL images
>>> original_image = download_image(img_url).resize((256, 256))
>>> mask_image = download_image(mask_url).resize((256, 256))
>>> # Load the RePaint scheduler and pipeline based on a pretrained DDPM model
>>> scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256")
>>> pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
>>> pipe = pipe.to("cuda")
>>> generator = torch.Generator(device="cuda").manual_seed(0)
>>> output = pipe(
... image=original_image,
... mask_image=mask_image,
... num_inference_steps=250,
... eta=0.0,
... jump_length=10,
... jump_n_sample=10,
... generator=generator,
... )
>>> inpainted_image = output.images[0]
```
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
original_image = image
......
......@@ -24,11 +24,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class ScoreSdeVePipeline(DiffusionPipeline):
r"""
Pipeline for unconditional image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. scheduler ([`SchedulerMixin`]):
The [`ScoreSdeVeScheduler`] scheduler to be used in combination with `unet` to denoise the encoded image.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image.
scheduler ([`ScoreSdeVeScheduler`]):
A `ScoreSdeVeScheduler` to be used in combination with `unet` to denoise the encoded image.
"""
unet: UNet2DModel
scheduler: ScoreSdeVeScheduler
......@@ -48,21 +53,23 @@ class ScoreSdeVePipeline(DiffusionPipeline):
**kwargs,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of images to generate.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
generator (`torch.Generator`, `optional`):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
output_type (`str`, `optional`, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
img_size = self.unet.config.sample_size
......
......@@ -16,11 +16,11 @@ class SemanticStableDiffusionPipelineOutput(BaseOutput):
Args:
images (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
num_channels)`.
nsfw_content_detected (`List[bool]`)
List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, or `None` if safety checking could not be performed.
List indicating whether the corresponding generated image contains not-safe-for-work” (nsfw) content or
`None` if safety checking could not be performed.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
......
......@@ -16,78 +16,34 @@ from . import SemanticStableDiffusionPipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
EXAMPLE_DOC_STRING = """
Examples:
```py
>>> import torch
>>> from diffusers import SemanticStableDiffusionPipeline
>>> pipe = SemanticStableDiffusionPipeline.from_pretrained(
... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
... )
>>> pipe = pipe.to("cuda")
>>> out = pipe(
... prompt="a photo of the face of a woman",
... num_images_per_prompt=1,
... guidance_scale=7,
... editing_prompt=[
... "smiling, smile", # Concepts to apply
... "glasses, wearing glasses",
... "curls, wavy hair, curly hair",
... "beard, full beard, mustache",
... ],
... reverse_editing_direction=[
... False,
... False,
... False,
... False,
... ], # Direction of guidance i.e. increase all concepts
... edit_warmup_steps=[10, 10, 10, 10], # Warmup period for each concept
... edit_guidance_scale=[4, 5, 5, 5.4], # Guidance scale for each concept
... edit_threshold=[
... 0.99,
... 0.975,
... 0.925,
... 0.96,
... ], # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
... edit_momentum_scale=0.3, # Momentum scale that will be added to the latent guidance
... edit_mom_beta=0.6, # Momentum beta
... edit_weights=[1, 1, 1, 1, 1], # Weights of the individual concepts against each other
... )
>>> image = out.images[0]
```
"""
class SemanticStableDiffusionPipeline(DiffusionPipeline):
r"""
Pipeline for text-to-image generation with latent editing.
Pipeline for text-to-image generation using Stable Diffusion with latent editing.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
This model builds on the implementation of ['StableDiffusionPipeline']
This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionPipeline`]. Check the superclass
documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
device, etc.).
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
text_encoder ([`CLIPTextModel`]):
Frozen text-encoder. Stable Diffusion uses the text portion of
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
tokenizer (`CLIPTokenizer`):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
text_encoder ([`~transformers.CLIPTextModel`]):
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
tokenizer ([`~transformers.CLIPTokenizer`]):
A `CLIPTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
safety_checker ([`Q16SafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful.
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
feature_extractor ([`CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
about a model's potential harms.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
_optional_components = ["safety_checker", "feature_extractor"]
......@@ -277,97 +233,130 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
sem_guidance: Optional[List[torch.Tensor]] = None,
):
r"""
Function invoked when calling the pipeline for generation.
The call function to the pipeline for generation.
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The prompt or prompts to guide image generation.
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
if `guidance_scale` is less than `1`).
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`.
tensor is generated by sampling using the supplied random `generator`.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple.
callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
A function that calls every `callback_steps` steps during inference. The function is called with the
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
The frequency at which the `callback` function is called. If not specified, the callback is called at
every step.
editing_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to use for Semantic guidance. Semantic guidance is disabled by setting
The prompt or prompts to use for semantic guidance. Semantic guidance is disabled by setting
`editing_prompt = None`. Guidance direction of prompt should be specified via
`reverse_editing_direction`.
editing_prompt_embeddings (`torch.Tensor>`, *optional*):
editing_prompt_embeddings (`torch.Tensor`, *optional*):
Pre-computed embeddings to use for semantic guidance. Guidance direction of embedding should be
specified via `reverse_editing_direction`.
reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
Guidance scale for semantic guidance. If provided as list values should correspond to `editing_prompt`.
`edit_guidance_scale` is defined as `s_e` of equation 6 of [SEGA
Paper](https://arxiv.org/pdf/2301.12247.pdf).
Guidance scale for semantic guidance. If provided as a list, values should correspond to
`editing_prompt`.
edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
Number of diffusion steps (for each prompt) for which semantic guidance will not be applied. Momentum
will still be calculated for those steps and applied once all warmup periods are over.
`edit_warmup_steps` is defined as `delta` (δ) of [SEGA Paper](https://arxiv.org/pdf/2301.12247.pdf).
Number of diffusion steps (for each prompt) for which semantic guidance is not applied. Momentum is
calculated for those steps and applied once all warmup periods are over.
edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
Number of diffusion steps (for each prompt) after which semantic guidance will no longer be applied.
Number of diffusion steps (for each prompt) after which semantic guidance is longer applied.
edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
Threshold of semantic guidance.
edit_momentum_scale (`float`, *optional*, defaults to 0.1):
Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0
momentum will be disabled. Momentum is already built up during warmup, i.e. for diffusion steps smaller
than `sld_warmup_steps`. Momentum will only be added to latent guidance once all warmup periods are
finished. `edit_momentum_scale` is defined as `s_m` of equation 7 of [SEGA
Paper](https://arxiv.org/pdf/2301.12247.pdf).
Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0,
momentum is disabled. Momentum is already built up during warmup (for diffusion steps smaller than
`sld_warmup_steps`). Momentum is only added to latent guidance once all warmup periods are finished.
edit_mom_beta (`float`, *optional*, defaults to 0.4):
Defines how semantic guidance momentum builds up. `edit_mom_beta` indicates how much of the previous
momentum will be kept. Momentum is already built up during warmup, i.e. for diffusion steps smaller
than `edit_warmup_steps`. `edit_mom_beta` is defined as `beta_m` (β) of equation 8 of [SEGA
Paper](https://arxiv.org/pdf/2301.12247.pdf).
momentum is kept. Momentum is already built up during warmup (for diffusion steps smaller than
`edit_warmup_steps`).
edit_weights (`List[float]`, *optional*, defaults to `None`):
Indicates how much each individual concept should influence the overall guidance. If no weights are
provided all concepts are applied equally. `edit_mom_beta` is defined as `g_i` of equation 9 of [SEGA
Paper](https://arxiv.org/pdf/2301.12247.pdf).
provided all concepts are applied equally.
sem_guidance (`List[torch.Tensor]`, *optional*):
List of pre-generated guidance vectors to be applied at generation. Length of the list has to
correspond to `num_inference_steps`.
Examples:
```py
>>> import torch
>>> from diffusers import SemanticStableDiffusionPipeline
>>> pipe = SemanticStableDiffusionPipeline.from_pretrained(
... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
... )
>>> pipe = pipe.to("cuda")
>>> out = pipe(
... prompt="a photo of the face of a woman",
... num_images_per_prompt=1,
... guidance_scale=7,
... editing_prompt=[
... "smiling, smile", # Concepts to apply
... "glasses, wearing glasses",
... "curls, wavy hair, curly hair",
... "beard, full beard, mustache",
... ],
... reverse_editing_direction=[
... False,
... False,
... False,
... False,
... ], # Direction of guidance i.e. increase all concepts
... edit_warmup_steps=[10, 10, 10, 10], # Warmup period for each concept
... edit_guidance_scale=[4, 5, 5, 5.4], # Guidance scale for each concept
... edit_threshold=[
... 0.99,
... 0.975,
... 0.925,
... 0.96,
... ], # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
... edit_momentum_scale=0.3, # Momentum scale that will be added to the latent guidance
... edit_mom_beta=0.6, # Momentum beta
... edit_weights=[1, 1, 1, 1, 1], # Weights of the individual concepts against each other
... )
>>> image = out.images[0]
```
Returns:
[`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] or `tuple`:
[`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] if `return_dict` is True,
otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the
second element is a list of `bool`s denoting whether the corresponding generated image likely represents
"not-safe-for-work" (nsfw) content, according to the `safety_checker`.
If `return_dict` is `True`,
[`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] is returned, otherwise a
`tuple` is returned where the first element is a list with the generated images and the second element
is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work"
(nsfw) content.
"""
# 0. Default height and width to unet
height = height or self.unet.config.sample_size * self.vae_scale_factor
......
......@@ -68,11 +68,11 @@ EXAMPLE_DOC_STRING = """
@dataclass
class ShapEPipelineOutput(BaseOutput):
"""
Output class for ShapEPipeline.
Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
Args:
images (`torch.FloatTensor`)
a list of images for 3D rendering
A list of images for 3D rendering.
"""
images: Union[List[List[PIL.Image.Image]], List[List[np.ndarray]]]
......@@ -80,10 +80,10 @@ class ShapEPipelineOutput(BaseOutput):
class ShapEPipeline(DiffusionPipeline):
"""
Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E
Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
prior ([`PriorTransformer`]):
......@@ -91,13 +91,12 @@ class ShapEPipeline(DiffusionPipeline):
text_encoder ([`CLIPTextModelWithProjection`]):
Frozen text-encoder.
tokenizer (`CLIPTokenizer`):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
A [`~transformers.CLIPTokenizer`] to tokenize text.
scheduler ([`HeunDiscreteScheduler`]):
A scheduler to be used in combination with `prior` to generate image embedding.
shap_e_renderer ([`ShapERenderer`]):
Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects
with the NeRF rendering method
with the NeRF rendering method.
"""
def __init__(
......@@ -132,10 +131,10 @@ class ShapEPipeline(DiffusionPipeline):
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
......@@ -222,7 +221,7 @@ class ShapEPipeline(DiffusionPipeline):
return_dict: bool = True,
):
"""
Function invoked when calling the pipeline for generation.
The call function to the pipeline for generation.
Args:
prompt (`str` or `List[str]`):
......@@ -233,30 +232,31 @@ class ShapEPipeline(DiffusionPipeline):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`.
tensor is generated by sampling using the supplied random `generator`.
guidance_scale (`float`, *optional*, defaults to 4.0):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
usually at the expense of lower image quality.
frame_size (`int`, *optional*, default to 64):
the width and height of each image frame of the generated 3d output
The width and height of each image frame of the generated 3D output.
output_type (`str`, *optional*, defaults to `"pt"`):
The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
(`np.array`),`"latent"` (`torch.Tensor`), mesh ([`MeshDecoderOutput`]).
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
tuple.
Examples:
Returns:
[`ShapEPipelineOutput`] or `tuple`
[`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
otherwise a `tuple` is returned where the first element is a list with the generated images.
"""
if isinstance(prompt, str):
......
......@@ -67,11 +67,11 @@ EXAMPLE_DOC_STRING = """
@dataclass
class ShapEPipelineOutput(BaseOutput):
"""
Output class for ShapEPipeline.
Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
Args:
images (`torch.FloatTensor`)
a list of images for 3D rendering
A list of images for 3D rendering.
"""
images: Union[PIL.Image.Image, np.ndarray]
......@@ -79,24 +79,24 @@ class ShapEPipelineOutput(BaseOutput):
class ShapEImg2ImgPipeline(DiffusionPipeline):
"""
Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E
Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E from an
image.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
prior ([`PriorTransformer`]):
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
text_encoder ([`CLIPTextModelWithProjection`]):
Frozen text-encoder.
tokenizer (`CLIPTokenizer`):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
image_encoder ([`CLIPVisionModel`]):
Frozen image-encoder.
image_processor (`CLIPImageProcessor`):
A [`~transformers.CLIPImageProcessor`] to process images.
scheduler ([`HeunDiscreteScheduler`]):
A scheduler to be used in combination with `prior` to generate image embedding.
shap_e_renderer ([`ShapERenderer`]):
Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects
with the NeRF rendering method
with the NeRF rendering method.
"""
def __init__(
......@@ -174,40 +174,41 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
return_dict: bool = True,
):
"""
Function invoked when calling the pipeline for generation.
The call function to the pipeline for generation.
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
`Image` or tensor representing an image batch to be used as the starting point. Can also accept image
latents as `image`, if passing latents directly, it will not be encoded again.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
num_inference_steps (`int`, *optional*, defaults to 100):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`.
tensor is generated by sampling using the supplied random `generator`.
guidance_scale (`float`, *optional*, defaults to 4.0):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
frame_size (`int`, *optional*, default to 64):
the width and height of each image frame of the generated 3d output
The width and height of each image frame of the generated 3D output.
output_type (`str`, *optional*, defaults to `"pt"`):
(`np.array`),`"latent"` (`torch.Tensor`), mesh ([`MeshDecoderOutput`]).
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
tuple.
Examples:
Returns:
[`ShapEPipelineOutput`] or `tuple`
[`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
otherwise a `tuple` is returned where the first element is a list with the generated images.
"""
if isinstance(image, PIL.Image.Image):
......
......@@ -38,6 +38,21 @@ TARGET_FEATURE_LENGTH = 256
class SpectrogramDiffusionPipeline(DiffusionPipeline):
r"""
Pipeline for unconditional audio generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
notes_encoder ([`SpectrogramNotesEncoder`]):
continuous_encoder ([`SpectrogramContEncoder`]):
decoder ([`T5FilmDecoder`]):
A [`T5FilmDecoder`] to denoise the encoded audio latents.
scheduler ([`DDPMScheduler`]):
A scheduler to be used in combination with `decoder` to denoise the encoded audio latents.
melgan ([`OnnxRuntimeModel`]):
"""
_optional_components = ["melgan"]
def __init__(
......@@ -127,6 +142,48 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
f" {type(callback_steps)}."
)
r"""
The call function to the pipeline for generation.
Args:
input_tokens (`List[List[int]]`):
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 100):
The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
expense of slower inference.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
output_type (`str`, *optional*, defaults to `"numpy"`):
The output format of the generated audio.
callback (`Callable`, *optional*):
A function that calls every `callback_steps` steps during inference. The function is called with the
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function is called. If not specified, the callback is called at
every step.
Example:
```py
>>> from diffusers import SpectrogramDiffusionPipeline, MidiProcessor
>>> pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
>>> pipe = pipe.to("cuda")
>>> processor = MidiProcessor()
>>> # Download MIDI from: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid
>>> output = pipe(processor("beethoven_hammerklavier_2.mid"))
>>> audio = output.audios[0]
```
Returns:
[`pipelines.AudioPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated audio.
"""
pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32)
full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32)
......
......@@ -25,11 +25,11 @@ class StableDiffusionPipelineOutput(BaseOutput):
Args:
images (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
num_channels)`.
nsfw_content_detected (`List[bool]`)
List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, or `None` if safety checking could not be performed.
List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
`None` if safety checking could not be performed.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
......@@ -116,14 +116,14 @@ if is_transformers_available() and is_flax_available():
@flax.struct.dataclass
class FlaxStableDiffusionPipelineOutput(BaseOutput):
"""
Output class for Stable Diffusion pipelines.
Output class for Flax-based Stable Diffusion pipelines.
Args:
images (`np.ndarray`)
Array of shape `(batch_size, height, width, num_channels)` with images from the diffusion pipeline.
nsfw_content_detected (`List[bool]`)
List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content.
images (`np.ndarray`):
Denoised images of array shape of `(batch_size, height, width, num_channels)`.
nsfw_content_detected (`List[bool]`):
List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content
or `None` if safety checking could not be performed.
"""
images: np.ndarray
......
......@@ -130,28 +130,27 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
r"""
Pipeline for text-guided image to image generation using Stable Diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
text_encoder ([`CLIPTextModel`]):
Frozen text-encoder. Stable Diffusion uses the text portion of
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
tokenizer (`CLIPTokenizer`):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
text_encoder ([`~transformers.CLIPTextModel`]):
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
tokenizer ([`~transformers.CLIPTokenizer`]):
A `CLIPTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can only be an
instance of [`DDIMScheduler`].
safety_checker ([`StableDiffusionSafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful.
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
feature_extractor ([`CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
about a model's potential harms.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
_optional_components = ["safety_checker", "feature_extractor"]
......@@ -234,10 +233,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
......@@ -595,71 +594,134 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
r"""
Function invoked when calling the pipeline for generation.
The call function to the pipeline for generation.
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded
again.
`Image` or tensor representing an image batch to be used as the starting point. Can also accept image
latents as `image`, but if passing latents directly it is not encoded again.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
starting point and more noise is added the higher the `strength`. The number of denoising steps depends
on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
essentially ignores `image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
expense of slower inference. This parameter is modulated by `strength`.
guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
source_guidance_scale (`float`, *optional*, defaults to 1):
Guidance scale for the source prompt. This is useful to control the amount of influence the source
prompt for encoding.
prompt has for encoding.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.1):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument.
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple.
callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
A function that calls every `callback_steps` steps during inference. The function is called with the
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
The frequency at which the `callback` function is called. If not specified, the callback is called at
every step.
cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
`self.processor` in
[diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
Example:
```py
import requests
import torch
from PIL import Image
from io import BytesIO
from diffusers import CycleDiffusionPipeline, DDIMScheduler
# load the pipeline
# make sure you're logged in with `huggingface-cli login`
model_id_or_path = "CompVis/stable-diffusion-v1-4"
scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
# let's download an initial image
url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((512, 512))
init_image.save("horse.png")
# let's specify a prompt
source_prompt = "An astronaut riding a horse"
prompt = "An astronaut riding an elephant"
# call the pipeline
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.8,
guidance_scale=2,
source_guidance_scale=1,
).images[0]
image.save("horse_to_elephant.png")
# let's try another example
# See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
url = (
"https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
)
response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((512, 512))
init_image.save("black.png")
source_prompt = "A black colored car"
prompt = "A blue colored car"
# call the pipeline
torch.manual_seed(0)
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.85,
guidance_scale=3,
source_guidance_scale=1,
).images[0]
image.save("black_to_blue.png")
```
Returns:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
When returning a tuple, the first element is a list with the generated images, and the second element is a
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
otherwise a `tuple` is returned where the first element is a list with the generated images and the
second element is a list of `bool`s indicating whether the corresponding generated image contains
"not-safe-for-work" (nsfw) content.
"""
# 1. Check inputs
self.check_inputs(prompt, strength, callback_steps)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment