[docs] Clean up pipeline apis (#3905)

* start with stable diffusion * fix * finish stable diffusion pipelines * fix path to pipeline output * fix flax paths * fix copies * add up to score sde ve * finish first pass of pipelines * fix copies * second review * align doc titles * more review fixes * final review

[docs] Clean up pipeline apis (#3905)
* start with stable diffusion * fix * finish stable diffusion pipelines * fix path to pipeline output * fix flax paths * fix copies * add up to score sde ve * finish first pass of pipelines * fix copies * second review * align doc titles * more review fixes * final review
a69754bb · Steven Liu · GitHub · bcc570b9 · a69754bb · a69754bb
Unverified Commit a69754bb authored Jul 21, 2023 by Steven Liu Committed by GitHub Jul 21, 2023
20 changed files
--- a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
+++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -26,13 +26,16 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

 class DanceDiffusionPipeline(DiffusionPipeline):
    r"""
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Pipeline for audio generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
-        unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded audio.
+        unet ([`UNet1DModel`]):
+            A `UNet1DModel` to denoise the encoded audio.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded audio. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
            [`IPNDMScheduler`].
    """

@@ -50,6 +53,8 @@ class DanceDiffusionPipeline(DiffusionPipeline):
        return_dict: bool = True,
    ) -> Union[AudioPipelineOutput, Tuple]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
            batch_size (`int`, *optional*, defaults to 1):
                The number of audio samples to generate.
@@ -57,17 +62,40 @@ class DanceDiffusionPipeline(DiffusionPipeline):
                The number of denoising steps. More denoising steps usually lead to a higher-quality audio sample at
                the expense of slower inference.
            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`):
-                The length of the generated audio sample in seconds. Note that the output of the pipeline, *i.e.*
-                `sample_size`, will be `audio_length_in_s` * `self.unet.config.sample_rate`.
+                The length of the generated audio sample in seconds.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.

+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        from scipy.io.wavfile import write
+
+        model_id = "harmonai/maestro-150k"
+        pipe = DiffusionPipeline.from_pretrained(model_id)
+        pipe = pipe.to("cuda")
+
+        audios = pipe(audio_length_in_s=4.0).audios
+
+        # To save locally
+        for i, audio in enumerate(audios):
+            write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose())
+
+        # To dislay in google colab
+        import IPython.display as ipd
+
+        for audio in audios:
+            display(ipd.Audio(audio, rate=pipe.unet.sample_rate))
+        ```
+
        Returns:
-            [`~pipelines.AudioPipelineOutput`] or `tuple`: [`~pipelines.utils.AudioPipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated audio.
+            [`~pipelines.AudioPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated audio.
        """

        if audio_length_in_s is None:

--- a/src/diffusers/pipelines/ddim/pipeline_ddim.py
+++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -23,11 +23,14 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

 class DDIMPipeline(DiffusionPipeline):
    r"""
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Pipeline for image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
-        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
            [`DDPMScheduler`], or [`DDIMScheduler`].
@@ -53,29 +56,56 @@ class DDIMPipeline(DiffusionPipeline):
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
            batch_size (`int`, *optional*, defaults to 1):
                The number of images to generate.
            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            eta (`float`, *optional*, defaults to 0.0):
-                The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. A value of `0` corresponds to
+                DDIM and `1` corresponds to DDPM.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            use_clipped_model_output (`bool`, *optional*, defaults to `None`):
-                if `True` or `False`, see documentation for `DDIMScheduler.step`. If `None`, nothing is passed
-                downstream to the scheduler. So use `None` for schedulers which don't support this argument.
+                If `True` or `False`, see documentation for [`DDIMScheduler.step`]. If `None`, nothing is passed
+                downstream to the scheduler (use `None` for schedulers which don't support this argument).
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

+        Example:
+
+        ```py
+        >>> from diffusers import DDIMPipeline
+        >>> import PIL.Image
+        >>> import numpy as np
+
+        >>> # load model and scheduler
+        >>> pipe = DDIMPipeline.from_pretrained("fusing/ddim-lsun-bedroom")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pipe(eta=0.0, num_inference_steps=50)
+
+        >>> # process image to PIL
+        >>> image_processed = image.cpu().permute(0, 2, 3, 1)
+        >>> image_processed = (image_processed + 1.0) * 127.5
+        >>> image_processed = image_processed.numpy().astype(np.uint8)
+        >>> image_pil = PIL.Image.fromarray(image_processed[0])
+
+        >>> # save image
+        >>> image_pil.save("test.png")
+        ```
+
        Returns:
-            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
        """

        # Sample gaussian noise to begin loop

--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -23,11 +23,14 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

 class DDPMPipeline(DiffusionPipeline):
    r"""
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Pipeline for image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
-        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
            [`DDPMScheduler`], or [`DDIMScheduler`].
@@ -47,24 +50,41 @@ class DDPMPipeline(DiffusionPipeline):
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
            batch_size (`int`, *optional*, defaults to 1):
                The number of images to generate.
            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            num_inference_steps (`int`, *optional*, defaults to 1000):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

+        Example:
+
+        ```py
+        >>> from diffusers import DDPMPipeline
+
+        >>> # load model and scheduler
+        >>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pipe().images[0]
+
+        >>> # save image
+        >>> image.save("ddpm_generated_image.png")
+        ```
+
        Returns:
-            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
        """
        # Sample gaussian noise to begin loop
        if isinstance(self.unet.config.sample_size, int):

--- a/src/diffusers/pipelines/dit/pipeline_dit.py
+++ b/src/diffusers/pipelines/dit/pipeline_dit.py
@@ -30,16 +30,18 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

 class DiTPipeline(DiffusionPipeline):
    r"""
-    This pipeline inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Pipeline for image generation based on a Transformer backbone instead of a UNet.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        transformer ([`Transformer2DModel`]):
-            Class conditioned Transformer in Diffusion model to denoise the encoded image latents.
+            A class conditioned `Transformer2DModel` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        scheduler ([`DDIMScheduler`]):
-            A scheduler to be used in combination with `dit` to denoise the encoded image latents.
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    """

    def __init__(
@@ -63,13 +65,15 @@ class DiTPipeline(DiffusionPipeline):
    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
        r"""

-        Map label strings, *e.g.* from ImageNet, to corresponding class ids.
+        Map label strings from ImageNet to corresponding class ids.

        Parameters:
-            label (`str` or `dict` of `str`): label strings to be mapped to class ids.
+            label (`str` or `dict` of `str`):
+                Label strings to be mapped to class ids.

        Returns:
-            `list` of `int`: Class ids to be processed by pipeline.
+            `list` of `int`:
+                Class ids to be processed by pipeline.
        """

        if not isinstance(label, list):
@@ -94,24 +98,53 @@ class DiTPipeline(DiffusionPipeline):
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
-        Function invoked when calling the pipeline for generation.
+        The call function to the pipeline for generation.

        Args:
            class_labels (List[int]):
-                List of imagenet class labels for the images to be generated.
+                List of ImageNet class labels for the images to be generated.
            guidance_scale (`float`, *optional*, defaults to 4.0):
-                Scale of the guidance signal.
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            num_inference_steps (`int`, *optional*, defaults to 250):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler
+        >>> import torch
+
+        >>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
+        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # pick words from Imagenet class labels
+        >>> pipe.labels  # to print all available words
+
+        >>> # pick words that exist in ImageNet
+        >>> words = ["white shark", "umbrella"]
+
+        >>> class_ids = pipe.get_label_ids(words)
+
+        >>> generator = torch.manual_seed(33)
+        >>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)
+
+        >>> image = output.images[0]  # label 'white shark'
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
        """

        batch_size = len(class_labels)

--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -31,18 +31,20 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

 class LDMTextToImagePipeline(DiffusionPipeline):
    r"""
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Pipeline for text-to-image generation using latent diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        vqvae ([`VQModel`]):
-            Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
        bert ([`LDMBertModel`]):
-            Text-encoder model based on [BERT](https://huggingface.co/docs/transformers/model_doc/bert) architecture.
-        tokenizer (`transformers.BertTokenizer`):
-            Tokenizer of class
-            [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+            Text-encoder model based on [`~transformers.BERT`].
+        tokenizer ([`~transformers.BertTokenizer`]):
+            A `BertTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
@@ -76,38 +78,54 @@ class LDMTextToImagePipeline(DiffusionPipeline):
        **kwargs,
    ) -> Union[Tuple, ImagePipelineOutput]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 1.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt` at
-                the, usually at the expense of lower image quality.
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
+                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import DiffusionPipeline
+
+        >>> # load model and scheduler
+        >>> ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> prompt = "A painting of a squirrel eating a burger"
+        >>> images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images
+
+        >>> # save images
+        >>> for idx, image in enumerate(images):
+        ...     image.save(f"squirrel-{idx}.png")
+        ```

        Returns:
-            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor

--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -31,15 +31,16 @@ def preprocess(image):

 class LDMSuperResolutionPipeline(DiffusionPipeline):
    r"""
-    A pipeline for image super-resolution using Latent
+    A pipeline for image super-resolution using latent diffusion.

-    This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        vqvae ([`VQModel`]):
-            Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations.
-        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
@@ -74,30 +75,58 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
        return_dict: bool = True,
    ) -> Union[Tuple, ImagePipelineOutput]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
            image (`torch.Tensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, that will be used as the starting point for the
-                process.
+                `Image` or tensor representing an image batch to be used as the starting point for the process.
            batch_size (`int`, *optional*, defaults to 1):
                Number of images to generate.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> import requests
+        >>> from PIL import Image
+        >>> from io import BytesIO
+        >>> from diffusers import LDMSuperResolutionPipeline
+        >>> import torch
+
+        >>> # load model and scheduler
+        >>> pipeline = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
+        >>> pipeline = pipeline.to("cuda")
+
+        >>> # let's download an  image
+        >>> url = (
+        ...     "https://user-images.githubusercontent.com/38061659/199705896-b48e17b8-b231-47cd-a270-4ffa5a93fa3e.png"
+        ... )
+        >>> response = requests.get(url)
+        >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> low_res_img = low_res_img.resize((128, 128))
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> upscaled_image = pipeline(low_res_img, num_inference_steps=100, eta=1).images[0]
+        >>> # save image
+        >>> upscaled_image.save("ldm_generated_image.png")
+        ```

        Returns:
-            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
        """
        if isinstance(image, PIL.Image.Image):
            batch_size = 1

--- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
+++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
@@ -25,15 +25,18 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

 class LDMPipeline(DiffusionPipeline):
    r"""
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Pipeline for unconditional image generation using latent diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        vqvae ([`VQModel`]):
-            Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
-        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latents.
+            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents.
+            [`DDIMScheduler`] is used in combination with `unet` to denoise the encoded image latents.
    """

    def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler):
@@ -52,24 +55,38 @@ class LDMPipeline(DiffusionPipeline):
        **kwargs,
    ) -> Union[Tuple, ImagePipelineOutput]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
            batch_size (`int`, *optional*, defaults to 1):
                Number of images to generate.
            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

+        Example:
+
+        ```py
+        >>> from diffusers import LDMPipeline
+
+        >>> # load model and scheduler
+        >>> pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pipe().images[0]
+        ```
+
        Returns:
-            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
        """

        latents = randn_tensor(

--- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -136,28 +136,36 @@ def prepare_mask_and_masked_image(image, mask):

 class PaintByExamplePipeline(DiffusionPipeline):
    r"""
-    Pipeline for image-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+    <Tip warning={true}>

-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    🧪 This is an experimental feature!
+
+    </Tip>
+
+    Pipeline for image-guided image inpainting using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        image_encoder ([`PaintByExampleImageEncoder`]):
-            Encodes the example input image. The unet is conditioned on the example image instead of a text prompt.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+            Encodes the example input image. The `unet` is conditioned on the example image instead of a text prompt.
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPImageProcessor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+
    """
    # TODO: feature_extractor is required to encode initial images (if they are in PIL format),
    # we should give a descriptive message if the pipeline doesn't have one.
@@ -378,66 +386,99 @@ class PaintByExamplePipeline(DiffusionPipeline):
        callback_steps: int = 1,
    ):
        r"""
-        Function invoked when calling the pipeline for generation.
+        The call function to the pipeline for generation.

        Args:
            example_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
-                The exemplar image to guide the image generation.
+                An example image to guide image generation.
            image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
-                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
-                be masked out with `mask_image` and repainted according to `prompt`.
+                `Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with
+                `mask_image` and repainted according to `prompt`).
            mask_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
-                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
-                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
-                instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted,
+                while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel
+                (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the
+                expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
+                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Example:
+
+        ```py
+        >>> import PIL
+        >>> import requests
+        >>> import torch
+        >>> from io import BytesIO
+        >>> from diffusers import PaintByExamplePipeline
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = (
+        ...     "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png"
+        ... )
+        >>> mask_url = (
+        ...     "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png"
+        ... )
+        >>> example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg"
+
+        >>> init_image = download_image(img_url).resize((512, 512))
+        >>> mask_image = download_image(mask_url).resize((512, 512))
+        >>> example_image = download_image(example_url).resize((512, 512))
+
+        >>> pipe = PaintByExamplePipeline.from_pretrained(
+        ...     "Fantasy-Studio/Paint-by-Example",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
+        >>> image
+        ```

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
        """
        # 1. Define call parameters
        if isinstance(image, PIL.Image.Image):

--- a/src/diffusers/pipelines/pipeline_flax_utils.py
+++ b/src/diffusers/pipelines/pipeline_flax_utils.py
@@ -92,18 +92,17 @@ class FlaxImagePipelineOutput(BaseOutput):

 class FlaxDiffusionPipeline(ConfigMixin):
    r"""
-    Base class for all models.
+    Base class for Flax-based pipelines.

-    [`FlaxDiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion
-    pipelines and handles methods for loading, downloading and saving models as well as a few methods common to all
-    pipelines to:
+    [`FlaxDiffusionPipeline`] stores all components (models, schedulers, and processors) for diffusion pipelines and
+    provides methods for loading, downloading and saving models. It also includes methods to:

-        - enabling/disabling the progress bar for the denoising iteration
+        - enable/disable the progress bar for the denoising iteration

    Class attributes:

-        - **config_name** ([`str`]) -- name of the config file that will store the class and module names of all
-          components of the diffusion pipeline.
+        - **config_name** ([`str`]) -- The configuration filename that stores the class and module names of all the
+          diffusion pipeline's components.
    """
    config_name = "model_index.json"

@@ -143,10 +142,9 @@ class FlaxDiffusionPipeline(ConfigMixin):
    def save_pretrained(self, save_directory: Union[str, os.PathLike], params: Union[Dict, FrozenDict]):
        # TODO: handle inference_state
        """
-        Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to
-        a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading
-        method. The pipeline can easily be re-loaded using the `[`~FlaxDiffusionPipeline.from_pretrained`]` class
-        method.
+        Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its
+        class implements both a save and loading method. The pipeline is easily reloaded using the
+        [`~FlaxDiffusionPipeline.from_pretrained`] class method.

        Arguments:
            save_directory (`str` or `os.PathLike`):
@@ -193,70 +191,61 @@ class FlaxDiffusionPipeline(ConfigMixin):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
        r"""
-        Instantiate a Flax diffusion pipeline from pre-trained pipeline weights.
+        Instantiate a Flax-based diffusion pipeline from pretrained pipeline weights.

-        The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
+        The pipeline is set in evaluation mode (`model.eval()) by default and dropout modules are deactivated.

-        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
-        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
-        task.
+        If you get the error message below, you need to finetune the weights for your downstream task:

-        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
-        weights are discarded.
+        ```
+        Some weights of FlaxUNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
+        ```

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
                Can be either:

-                    - A string, the *repo id* of a pretrained pipeline hosted inside a model repo on
-                      https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like
-                      `CompVis/ldm-text2im-large-256`.
-                    - A path to a *directory* containing pipeline weights saved using
-                      [`~FlaxDiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`.
+                    - A string, the *repo id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained pipeline
+                      hosted on the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      using [`~FlaxDiffusionPipeline.save_pretrained`].
            dtype (`str` or `jnp.dtype`, *optional*):
-                Override the default `jnp.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
-                will be automatically derived from the model's weights.
+                Override the default `jnp.dtype` and load the model under this dtype. If `"auto"`, the dtype is
+                automatically derived from the model's weights.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
-                file exists.
+                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+                incompletely downloaded files are deleted.
            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            output_loading_info(`bool`, *optional*, defaults to `False`):
                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(`bool`, *optional*, defaults to `False`):
-                Whether or not to only look at local files (i.e., do not try to download the model).
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
            use_auth_token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `huggingface-cli login` (stored in `~/.huggingface`).
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-                identifier allowed by git.
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
            mirror (`str`, *optional*):
-                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
-                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
-                Please refer to the mirror site for more information. specify the folder name here.
-
+                Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
            kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the
-                specific pipeline class. The overwritten components are then directly passed to the pipelines
-                `__init__` method. See example below for more information.
-
-        <Tip>
-
-         It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
-         models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`
-
-        </Tip>
+                Can be used to overwrite load and saveable variables (the pipeline components) of the specific pipeline
+                class. The overwritten components are passed directly to the pipelines `__init__` method.

        <Tip>

-        Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
-        this method in a firewalled environment.
+        To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
+        `huggingface-cli login`. You can also activate the special
+        [“offline-mode”](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
+        firewalled environment.

        </Tip>

@@ -540,7 +529,7 @@ class FlaxDiffusionPipeline(ConfigMixin):
    @staticmethod
    def numpy_to_pil(images):
        """
-        Convert a numpy image or a batch of images to a PIL image.
+        Convert a NumPy image or a batch of images to a PIL image.
        """
        if images.ndim == 3:
            images = images[None, ...]

--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -463,13 +463,13 @@ class DiffusionPipeline(ConfigMixin):
    provides methods for loading, downloading and saving models. It also includes methods to:

        - move all PyTorch modules to the device of your choice
-        - enabling/disabling the progress bar for the denoising iteration
+        - enable/disable the progress bar for the denoising iteration

    Class attributes:

        - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
          diffusion pipeline's components.
-        - **_optional_components** (List[`str`]) -- List of all optional components that don't have to be passed to the
+        - **_optional_components** (`List[str]`) -- List of all optional components that don't have to be passed to the
          pipeline to function (should be overridden by subclasses).
    """
    config_name = "model_index.json"
@@ -1475,10 +1475,9 @@ class DiffusionPipeline(ConfigMixin):

    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        r"""
-        Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
-
-        When this option is enabled, you should observe lower GPU memory usage and a potential speed up during
-        inference. Speed up during training is not guaranteed.
+        Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). When this
+        option is enabled, you should observe lower GPU memory usage and a potential speed up during inference. Speed
+        up during training is not guaranteed.

        <Tip warning={true}>

@@ -1537,10 +1536,9 @@ class DiffusionPipeline(ConfigMixin):

    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
        r"""
-        Enable sliced attention computation.
-
-        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
-        several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Enable sliced attention computation. When this option is enabled, the attention module splits the input tensor
+        in slices to compute attention in several steps. This is useful to save some memory in exchange for a small
+        speed decrease.

        Args:
            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):

--- a/src/diffusers/pipelines/pndm/pipeline_pndm.py
+++ b/src/diffusers/pipelines/pndm/pipeline_pndm.py
@@ -25,13 +25,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

 class PNDMPipeline(DiffusionPipeline):
    r"""
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Pipeline for unconditional image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
-        unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            The `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image.
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`PNDMScheduler`]):
+            A `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image.
    """

    unet: UNet2DModel
@@ -55,22 +58,41 @@ class PNDMPipeline(DiffusionPipeline):
        **kwargs,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
-            batch_size (`int`, `optional`, defaults to 1): The number of images to generate.
+            batch_size (`int`, `optional`, defaults to 1):
+                The number of images to generate.
            num_inference_steps (`int`, `optional`, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
-            generator (`torch.Generator`, `optional`): A [torch
-                generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            output_type (`str`, `optional`, defaults to `"pil"`): The output format of the generate image. Choose
-                between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, `optional`, defaults to `True`): Whether or not to return a
-                [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            generator (`torch.Generator`, `optional`):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            output_type (`str`, `optional`, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import PNDMPipeline
+
+        >>> # load model and scheduler
+        >>> pndm = PNDMPipeline.from_pretrained("google/ddpm-cifar10-32")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pndm().images[0]
+
+        >>> # save image
+        >>> image.save("pndm_generated_image.png")
+        ```

        Returns:
-            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
        """
        # For more information on the sampling method you can take a look at Algorithm 2 of
        # the official paper: https://arxiv.org/pdf/2202.09778.pdf

--- a/src/diffusers/pipelines/repaint/pipeline_repaint.py
+++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py
@@ -77,6 +77,19 @@ def _preprocess_mask(mask: Union[List, PIL.Image.Image, torch.Tensor]):


 class RePaintPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image inpainting using RePaint.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`RePaintScheduler`]):
+            A `RePaintScheduler` to be used in combination with `unet` to denoise the encoded image.
+    """
+
    unet: UNet2DModel
    scheduler: RePaintScheduler

@@ -98,35 +111,77 @@ class RePaintPipeline(DiffusionPipeline):
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
            image (`torch.FloatTensor` or `PIL.Image.Image`):
                The original image to inpaint on.
            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                The mask_image where 0.0 values define which part of the original image to inpaint (change).
+                The mask_image where 0.0 define which part of the original image to inpaint.
            num_inference_steps (`int`, *optional*, defaults to 1000):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            eta (`float`):
-                The weight of noise for added noise in a diffusion step. Its value is between 0.0 and 1.0 - 0.0 is DDIM
-                and 1.0 is DDPM scheduler respectively.
+                The weight of the added noise in a diffusion step. Its value is between 0.0 and 1.0; 0.0 corresponds to
+                DDIM and 1.0 is the DDPM scheduler.
            jump_length (`int`, *optional*, defaults to 10):
                The number of steps taken forward in time before going backward in time for a single jump ("j" in
-                RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+                RePaint paper). Take a look at Figure 9 and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
            jump_n_sample (`int`, *optional*, defaults to 10):
-                The number of times we will make forward time jump for a given chosen time sample. Take a look at
-                Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+                The number of times to make a forward time jump for a given chosen time sample. Take a look at Figure 9
+                and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            output_type (`str`, `optional`, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from io import BytesIO
+        >>> import torch
+        >>> import PIL
+        >>> import requests
+        >>> from diffusers import RePaintPipeline, RePaintScheduler
+
+
+        >>> def download_image(url):
+        ...     response = requests.get(url)
+        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+        >>> img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png"
+        >>> mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
+
+        >>> # Load the original image and the mask as PIL images
+        >>> original_image = download_image(img_url).resize((256, 256))
+        >>> mask_image = download_image(mask_url).resize((256, 256))
+
+        >>> # Load the RePaint scheduler and pipeline based on a pretrained DDPM model
+        >>> scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256")
+        >>> pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> output = pipe(
+        ...     image=original_image,
+        ...     mask_image=mask_image,
+        ...     num_inference_steps=250,
+        ...     eta=0.0,
+        ...     jump_length=10,
+        ...     jump_n_sample=10,
+        ...     generator=generator,
+        ... )
+        >>> inpainted_image = output.images[0]
+        ```

        Returns:
-            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
        """

        original_image = image

--- a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
+++ b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
@@ -24,11 +24,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput

 class ScoreSdeVePipeline(DiffusionPipeline):
    r"""
+    Pipeline for unconditional image generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
    Parameters:
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. scheduler ([`SchedulerMixin`]):
-            The [`ScoreSdeVeScheduler`] scheduler to be used in combination with `unet` to denoise the encoded image.
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image.
+        scheduler ([`ScoreSdeVeScheduler`]):
+            A `ScoreSdeVeScheduler` to be used in combination with `unet` to denoise the encoded image.
    """
    unet: UNet2DModel
    scheduler: ScoreSdeVeScheduler
@@ -48,21 +53,23 @@ class ScoreSdeVePipeline(DiffusionPipeline):
        **kwargs,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
+        The call function to the pipeline for generation.
+
        Args:
            batch_size (`int`, *optional*, defaults to 1):
                The number of images to generate.
-            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            generator (`torch.Generator`, `optional`):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            output_type (`str`, `optional`, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.

        Returns:
-            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
-            True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
        """

        img_size = self.unet.config.sample_size

--- a/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py
@@ -16,11 +16,11 @@ class SemanticStableDiffusionPipelineOutput(BaseOutput):

    Args:
        images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
-            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
        nsfw_content_detected (`List[bool]`)
-            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, or `None` if safety checking could not be performed.
+            List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or
+            `None` if safety checking could not be performed.
    """

    images: Union[List[PIL.Image.Image], np.ndarray]

--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -16,78 +16,34 @@ from . import SemanticStableDiffusionPipelineOutput


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import SemanticStableDiffusionPipeline
-
-        >>> pipe = SemanticStableDiffusionPipeline.from_pretrained(
-        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
-        ... )
-        >>> pipe = pipe.to("cuda")
-
-        >>> out = pipe(
-        ...     prompt="a photo of the face of a woman",
-        ...     num_images_per_prompt=1,
-        ...     guidance_scale=7,
-        ...     editing_prompt=[
-        ...         "smiling, smile",  # Concepts to apply
-        ...         "glasses, wearing glasses",
-        ...         "curls, wavy hair, curly hair",
-        ...         "beard, full beard, mustache",
-        ...     ],
-        ...     reverse_editing_direction=[
-        ...         False,
-        ...         False,
-        ...         False,
-        ...         False,
-        ...     ],  # Direction of guidance i.e. increase all concepts
-        ...     edit_warmup_steps=[10, 10, 10, 10],  # Warmup period for each concept
-        ...     edit_guidance_scale=[4, 5, 5, 5.4],  # Guidance scale for each concept
-        ...     edit_threshold=[
-        ...         0.99,
-        ...         0.975,
-        ...         0.925,
-        ...         0.96,
-        ...     ],  # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
-        ...     edit_momentum_scale=0.3,  # Momentum scale that will be added to the latent guidance
-        ...     edit_mom_beta=0.6,  # Momentum beta
-        ...     edit_weights=[1, 1, 1, 1, 1],  # Weights of the individual concepts against each other
-        ... )
-        >>> image = out.images[0]
-        ```
-"""


 class SemanticStableDiffusionPipeline(DiffusionPipeline):
    r"""
-    Pipeline for text-to-image generation with latent editing.
+    Pipeline for text-to-image generation using Stable Diffusion with latent editing.

-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    This model builds on the implementation of ['StableDiffusionPipeline']
+    This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionPipeline`]. Check the superclass
+    documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
+    device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`Q16SafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
-        feature_extractor ([`CLIPImageProcessor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
    """

    _optional_components = ["safety_checker", "feature_extractor"]
@@ -277,97 +233,130 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
        sem_guidance: Optional[List[torch.Tensor]] = None,
    ):
        r"""
-        Function invoked when calling the pipeline for generation.
+        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The prompt or prompts to guide image generation.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
-                if `guidance_scale` is less than `1`).
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
+                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
            editing_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to use for Semantic guidance. Semantic guidance is disabled by setting
+                The prompt or prompts to use for semantic guidance. Semantic guidance is disabled by setting
                `editing_prompt = None`. Guidance direction of prompt should be specified via
                `reverse_editing_direction`.
-            editing_prompt_embeddings (`torch.Tensor>`, *optional*):
+            editing_prompt_embeddings (`torch.Tensor`, *optional*):
                Pre-computed embeddings to use for semantic guidance. Guidance direction of embedding should be
                specified via `reverse_editing_direction`.
            reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
                Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
            edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
-                Guidance scale for semantic guidance. If provided as list values should correspond to `editing_prompt`.
-                `edit_guidance_scale` is defined as `s_e` of equation 6 of [SEGA
-                Paper](https://arxiv.org/pdf/2301.12247.pdf).
+                Guidance scale for semantic guidance. If provided as a list, values should correspond to
+                `editing_prompt`.
            edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
-                Number of diffusion steps (for each prompt) for which semantic guidance will not be applied. Momentum
-                will still be calculated for those steps and applied once all warmup periods are over.
-                `edit_warmup_steps` is defined as `delta` (δ) of [SEGA Paper](https://arxiv.org/pdf/2301.12247.pdf).
+                Number of diffusion steps (for each prompt) for which semantic guidance is not applied. Momentum is
+                calculated for those steps and applied once all warmup periods are over.
            edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
-                Number of diffusion steps (for each prompt) after which semantic guidance will no longer be applied.
+                Number of diffusion steps (for each prompt) after which semantic guidance is longer applied.
            edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
                Threshold of semantic guidance.
            edit_momentum_scale (`float`, *optional*, defaults to 0.1):
-                Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0
-                momentum will be disabled. Momentum is already built up during warmup, i.e. for diffusion steps smaller
-                than `sld_warmup_steps`. Momentum will only be added to latent guidance once all warmup periods are
-                finished. `edit_momentum_scale` is defined as `s_m` of equation 7 of [SEGA
-                Paper](https://arxiv.org/pdf/2301.12247.pdf).
+                Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0,
+                momentum is disabled. Momentum is already built up during warmup (for diffusion steps smaller than
+                `sld_warmup_steps`). Momentum is only added to latent guidance once all warmup periods are finished.
            edit_mom_beta (`float`, *optional*, defaults to 0.4):
                Defines how semantic guidance momentum builds up. `edit_mom_beta` indicates how much of the previous
-                momentum will be kept. Momentum is already built up during warmup, i.e. for diffusion steps smaller
-                than `edit_warmup_steps`. `edit_mom_beta` is defined as `beta_m` (β) of equation 8 of [SEGA
-                Paper](https://arxiv.org/pdf/2301.12247.pdf).
+                momentum is kept. Momentum is already built up during warmup (for diffusion steps smaller than
+                `edit_warmup_steps`).
            edit_weights (`List[float]`, *optional*, defaults to `None`):
                Indicates how much each individual concept should influence the overall guidance. If no weights are
-                provided all concepts are applied equally. `edit_mom_beta` is defined as `g_i` of equation 9 of [SEGA
-                Paper](https://arxiv.org/pdf/2301.12247.pdf).
+                provided all concepts are applied equally.
            sem_guidance (`List[torch.Tensor]`, *optional*):
                List of pre-generated guidance vectors to be applied at generation. Length of the list has to
                correspond to `num_inference_steps`.

+        Examples:
+
+        ```py
+        >>> import torch
+        >>> from diffusers import SemanticStableDiffusionPipeline
+
+        >>> pipe = SemanticStableDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> out = pipe(
+        ...     prompt="a photo of the face of a woman",
+        ...     num_images_per_prompt=1,
+        ...     guidance_scale=7,
+        ...     editing_prompt=[
+        ...         "smiling, smile",  # Concepts to apply
+        ...         "glasses, wearing glasses",
+        ...         "curls, wavy hair, curly hair",
+        ...         "beard, full beard, mustache",
+        ...     ],
+        ...     reverse_editing_direction=[
+        ...         False,
+        ...         False,
+        ...         False,
+        ...         False,
+        ...     ],  # Direction of guidance i.e. increase all concepts
+        ...     edit_warmup_steps=[10, 10, 10, 10],  # Warmup period for each concept
+        ...     edit_guidance_scale=[4, 5, 5, 5.4],  # Guidance scale for each concept
+        ...     edit_threshold=[
+        ...         0.99,
+        ...         0.975,
+        ...         0.925,
+        ...         0.96,
+        ...     ],  # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
+        ...     edit_momentum_scale=0.3,  # Momentum scale that will be added to the latent guidance
+        ...     edit_mom_beta=0.6,  # Momentum beta
+        ...     edit_weights=[1, 1, 1, 1, 1],  # Weights of the individual concepts against each other
+        ... )
+        >>> image = out.images[0]
+        ```
+
        Returns:
            [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] if `return_dict` is True,
-            otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the
-            second element is a list of `bool`s denoting whether the corresponding generated image likely represents
-            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+                If `return_dict` is `True`,
+                [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is a list with the generated images and the second element
+                is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work"
+                (nsfw) content.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor

--- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py
+++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py
@@ -68,11 +68,11 @@ EXAMPLE_DOC_STRING = """
 @dataclass
 class ShapEPipelineOutput(BaseOutput):
    """
-    Output class for ShapEPipeline.
+    Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].

    Args:
        images (`torch.FloatTensor`)
-            a list of images for 3D rendering
+            A list of images for 3D rendering.
    """

    images: Union[List[List[PIL.Image.Image]], List[List[np.ndarray]]]
@@ -80,10 +80,10 @@ class ShapEPipelineOutput(BaseOutput):

 class ShapEPipeline(DiffusionPipeline):
    """
-    Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E
+    Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E.

-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        prior ([`PriorTransformer`]):
@@ -91,13 +91,12 @@ class ShapEPipeline(DiffusionPipeline):
        text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+             A [`~transformers.CLIPTokenizer`] to tokenize text.
        scheduler ([`HeunDiscreteScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
        shap_e_renderer ([`ShapERenderer`]):
            Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects
-            with the NeRF rendering method
+            with the NeRF rendering method.
    """

    def __init__(
@@ -132,10 +131,10 @@ class ShapEPipeline(DiffusionPipeline):

    def enable_model_cpu_offload(self, gpu_id=0):
        r"""
-        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
+        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
+        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
+        iterative execution of the `unet`.
        """
        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
            from accelerate import cpu_offload_with_hook
@@ -222,7 +221,7 @@ class ShapEPipeline(DiffusionPipeline):
        return_dict: bool = True,
    ):
        """
-        Function invoked when calling the pipeline for generation.
+        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
@@ -233,30 +232,31 @@ class ShapEPipeline(DiffusionPipeline):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
+                tensor is generated by sampling using the supplied random `generator`.
            guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
                usually at the expense of lower image quality.
            frame_size (`int`, *optional*, default to 64):
-                the width and height of each image frame of the generated 3d output
+                The width and height of each image frame of the generated 3D output.
            output_type (`str`, *optional*, defaults to `"pt"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`),`"latent"` (`torch.Tensor`), mesh ([`MeshDecoderOutput`]).
            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
+                tuple.

        Examples:

        Returns:
-            [`ShapEPipelineOutput`] or `tuple`
+            [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images.
        """

        if isinstance(prompt, str):

--- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
+++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
@@ -67,11 +67,11 @@ EXAMPLE_DOC_STRING = """
 @dataclass
 class ShapEPipelineOutput(BaseOutput):
    """
-    Output class for ShapEPipeline.
+    Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].

    Args:
        images (`torch.FloatTensor`)
-            a list of images for 3D rendering
+            A list of images for 3D rendering.
    """

    images: Union[PIL.Image.Image, np.ndarray]
@@ -79,24 +79,24 @@ class ShapEPipelineOutput(BaseOutput):

 class ShapEImg2ImgPipeline(DiffusionPipeline):
    """
-    Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E
+    Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E from an
+    image.

-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        prior ([`PriorTransformer`]):
            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
-        text_encoder ([`CLIPTextModelWithProjection`]):
-            Frozen text-encoder.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        image_encoder ([`CLIPVisionModel`]):
+            Frozen image-encoder.
+        image_processor (`CLIPImageProcessor`):
+             A [`~transformers.CLIPImageProcessor`] to process images.
        scheduler ([`HeunDiscreteScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
        shap_e_renderer ([`ShapERenderer`]):
            Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects
-            with the NeRF rendering method
+            with the NeRF rendering method.
    """

    def __init__(
@@ -174,40 +174,41 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
        return_dict: bool = True,
    ):
        """
-        Function invoked when calling the pipeline for generation.
+        The call function to the pipeline for generation.

        Args:
-            prompt (`str` or `List[str]`):
-                The prompt or prompts to guide the image generation.
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
+                latents as `image`, if passing latents directly, it will not be encoded again.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
+                tensor is generated by sampling using the supplied random `generator`.
            guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            frame_size (`int`, *optional*, default to 64):
-                the width and height of each image frame of the generated 3d output
+                The width and height of each image frame of the generated 3D output.
            output_type (`str`, *optional*, defaults to `"pt"`):
                (`np.array`),`"latent"` (`torch.Tensor`), mesh ([`MeshDecoderOutput`]).
            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain
+                tuple.

        Examples:

        Returns:
-            [`ShapEPipelineOutput`] or `tuple`
+            [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images.
        """

        if isinstance(image, PIL.Image.Image):

--- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
+++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
@@ -38,6 +38,21 @@ TARGET_FEATURE_LENGTH = 256


 class SpectrogramDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for unconditional audio generation.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        notes_encoder ([`SpectrogramNotesEncoder`]):
+        continuous_encoder ([`SpectrogramContEncoder`]):
+        decoder ([`T5FilmDecoder`]):
+            A [`T5FilmDecoder`] to denoise the encoded audio latents.
+        scheduler ([`DDPMScheduler`]):
+            A scheduler to be used in combination with `decoder` to denoise the encoded audio latents.
+        melgan ([`OnnxRuntimeModel`]):
+    """
    _optional_components = ["melgan"]

    def __init__(
@@ -127,6 +142,48 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline):
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            input_tokens (`List[List[int]]`):
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
+                expense of slower inference.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
+            output_type (`str`, *optional*, defaults to `"numpy"`):
+                The output format of the generated audio.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Example:
+
+        ```py
+        >>> from diffusers import SpectrogramDiffusionPipeline, MidiProcessor
+
+        >>> pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
+        >>> pipe = pipe.to("cuda")
+        >>> processor = MidiProcessor()
+
+        >>> # Download MIDI from: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid
+        >>> output = pipe(processor("beethoven_hammerklavier_2.mid"))
+
+        >>> audio = output.audios[0]
+        ```
+
+        Returns:
+            [`pipelines.AudioPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated audio.
+        """

        pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32)
        full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32)

--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -25,11 +25,11 @@ class StableDiffusionPipelineOutput(BaseOutput):

    Args:
        images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
-            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
        nsfw_content_detected (`List[bool]`)
-            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, or `None` if safety checking could not be performed.
+            List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
+            `None` if safety checking could not be performed.
    """

    images: Union[List[PIL.Image.Image], np.ndarray]
@@ -116,14 +116,14 @@ if is_transformers_available() and is_flax_available():
    @flax.struct.dataclass
    class FlaxStableDiffusionPipelineOutput(BaseOutput):
        """
-        Output class for Stable Diffusion pipelines.
+        Output class for Flax-based Stable Diffusion pipelines.

        Args:
-            images (`np.ndarray`)
-                Array of shape `(batch_size, height, width, num_channels)` with images from the diffusion pipeline.
-            nsfw_content_detected (`List[bool]`)
-                List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
-                (nsfw) content.
+            images (`np.ndarray`):
+                Denoised images of array shape of `(batch_size, height, width, num_channels)`.
+            nsfw_content_detected (`List[bool]`):
+                List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content
+                or `None` if safety checking could not be performed.
        """

        images: np.ndarray

--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -130,28 +130,27 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
    r"""
    Pipeline for text-guided image to image generation using Stable Diffusion.

-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can only be an
+            instance of [`DDIMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
-        feature_extractor ([`CLIPImageProcessor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
    """
    _optional_components = ["safety_checker", "feature_extractor"]

@@ -234,10 +233,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
    def enable_model_cpu_offload(self, gpu_id=0):
        r"""
-        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
+        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
+        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
+        iterative execution of the `unet`.
        """
        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
            from accelerate import cpu_offload_with_hook
@@ -595,71 +594,134 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        r"""
-        Function invoked when calling the pipeline for generation.
+        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
-                `Image`, or tensor representing an image batch, that will be used as the starting point for the
-                process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded
-                again.
+                `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
-                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
-                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
-                be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference. This parameter will be modulated by `strength`.
+                expense of slower inference. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            source_guidance_scale (`float`, *optional*, defaults to 1):
                Guidance scale for the source prompt. This is useful to control the amount of influence the source
-                prompt for encoding.
+                prompt has for encoding.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.1):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
            cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+        Example:
+
+        ```py
+        import requests
+        import torch
+        from PIL import Image
+        from io import BytesIO
+
+        from diffusers import CycleDiffusionPipeline, DDIMScheduler
+
+        # load the pipeline
+        # make sure you're logged in with `huggingface-cli login`
+        model_id_or_path = "CompVis/stable-diffusion-v1-4"
+        scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
+        pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
+
+        # let's download an initial image
+        url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
+        response = requests.get(url)
+        init_image = Image.open(BytesIO(response.content)).convert("RGB")
+        init_image = init_image.resize((512, 512))
+        init_image.save("horse.png")
+
+        # let's specify a prompt
+        source_prompt = "An astronaut riding a horse"
+        prompt = "An astronaut riding an elephant"
+
+        # call the pipeline
+        image = pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            image=init_image,
+            num_inference_steps=100,
+            eta=0.1,
+            strength=0.8,
+            guidance_scale=2,
+            source_guidance_scale=1,
+        ).images[0]
+
+        image.save("horse_to_elephant.png")
+
+        # let's try another example
+        # See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
+        url = (
+            "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
+        )
+        response = requests.get(url)
+        init_image = Image.open(BytesIO(response.content)).convert("RGB")
+        init_image = init_image.resize((512, 512))
+        init_image.save("black.png")
+
+        source_prompt = "A black colored car"
+        prompt = "A blue colored car"
+
+        # call the pipeline
+        torch.manual_seed(0)
+        image = pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            image=init_image,
+            num_inference_steps=100,
+            eta=0.1,
+            strength=0.85,
+            guidance_scale=3,
+            source_guidance_scale=1,
+        ).images[0]
+
+        image.save("black_to_blue.png")
+        ```

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
        """
        # 1. Check inputs
        self.check_inputs(prompt, strength, callback_steps)