Unverified commit a69754bb authored by Steven Liu, committed by GitHub

[docs] Clean up pipeline apis (#3905)

* start with stable diffusion

* fix

* finish stable diffusion pipelines

* fix path to pipeline output

* fix flax paths

* fix copies

* add up to score sde ve

* finish first pass of pipelines

* fix copies

* second review

* align doc titles

* more review fixes

* final review
parent bcc570b9
@@ -26,13 +26,16 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
class DanceDiffusionPipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for audio generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded audio.
unet ([`UNet1DModel`]):
A `UNet1DModel` to denoise the encoded audio.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded audio. Can be one of
A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
[`IPNDMScheduler`].
"""
@@ -50,6 +53,8 @@ class DanceDiffusionPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[AudioPipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of audio samples to generate.
@@ -57,17 +62,40 @@ class DanceDiffusionPipeline(DiffusionPipeline):
The number of denoising steps. More denoising steps usually lead to a higher-quality audio sample at
the expense of slower inference.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`):
The length of the generated audio sample in seconds. Note that the output of the pipeline, *i.e.*
`sample_size`, will be `audio_length_in_s` * `self.unet.config.sample_rate`.
The length of the generated audio sample in seconds.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
Example:
```py
from diffusers import DiffusionPipeline
from scipy.io.wavfile import write

model_id = "harmonai/maestro-150k"
pipe = DiffusionPipeline.from_pretrained(model_id)
pipe = pipe.to("cuda")

audios = pipe(audio_length_in_s=4.0).audios

# To save locally
for i, audio in enumerate(audios):
    write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose())

# To display in Google Colab
import IPython.display as ipd

for audio in audios:
    display(ipd.Audio(audio, rate=pipe.unet.sample_rate))
```
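A hedged variation on the example above, reusing `pipe`: seeding the `generator` argument makes the sampled audio reproducible.

```py
import torch

# the generator device is assumed to match the pipeline's ("cuda" above)
generator = torch.Generator(device="cuda").manual_seed(0)
audio = pipe(audio_length_in_s=4.0, generator=generator).audios[0]
```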
Returns:
[`~pipelines.AudioPipelineOutput`] or `tuple`: [`~pipelines.utils.AudioPipelineOutput`] if `return_dict` is
True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated audio.
[`~pipelines.AudioPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated audio.
"""
if audio_length_in_s is None:
@@ -23,11 +23,14 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DDIMPipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
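Because the same UNet weights work with either scheduler, a checkpoint trained for DDPM can be sampled with DDIM. A hedged sketch, assuming the `google/ddpm-cat-256` repo uses the standard `unet`/`scheduler` subfolders:

```py
from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel

unet = UNet2DModel.from_pretrained("google/ddpm-cat-256", subfolder="unet")
scheduler = DDIMScheduler.from_pretrained("google/ddpm-cat-256", subfolder="scheduler")

pipe = DDIMPipeline(unet=unet, scheduler=scheduler)
image = pipe(num_inference_steps=50).images[0]  # far fewer steps than DDPM's 1000
```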
@@ -53,29 +56,56 @@ class DDIMPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of images to generate.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
eta (`float`, *optional*, defaults to 0.0):
The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. A value of `0` corresponds to
DDIM and `1` corresponds to DDPM.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
use_clipped_model_output (`bool`, *optional*, defaults to `None`):
if `True` or `False`, see documentation for `DDIMScheduler.step`. If `None`, nothing is passed
downstream to the scheduler. So use `None` for schedulers which don't support this argument.
If `True` or `False`, see documentation for [`DDIMScheduler.step`]. If `None`, nothing is passed
downstream to the scheduler (use `None` for schedulers which don't support this argument).
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DDIMPipeline

>>> # load model and scheduler
>>> pipe = DDIMPipeline.from_pretrained("fusing/ddim-lsun-bedroom")

>>> # run pipeline in inference (sample random noise and denoise);
>>> # the pipeline returns an ImagePipelineOutput whose images are already PIL
>>> image = pipe(eta=0.0, num_inference_steps=50).images[0]

>>> # save image
>>> image.save("test.png")
```
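To see the `eta` knob in practice, a hedged follow-up reusing `pipe` from the example: the same seed sampled at three variance scales.

```py
import torch

# eta=0.0 is deterministic DDIM; eta=1.0 injects DDPM-like variance
for eta in (0.0, 0.5, 1.0):
    generator = torch.manual_seed(33)
    image = pipe(eta=eta, num_inference_steps=50, generator=generator).images[0]
    image.save(f"ddim_eta_{eta}.png")
```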
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
# Sample gaussian noise to begin loop
@@ -23,11 +23,14 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DDPMPipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for image generation.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
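A hedged sketch of the component view this Parameters section describes, loading the full pipeline and inspecting what was registered:

```py
from diffusers import DDPMPipeline

pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")

# the registered components mirror the Parameters section above
print(type(pipe.unet).__name__)       # UNet2DModel
print(type(pipe.scheduler).__name__)  # DDPMScheduler
```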
@@ -47,24 +50,41 @@ class DDPMPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
The call function to the pipeline for generation.
Args:
batch_size (`int`, *optional*, defaults to 1):
The number of images to generate.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 1000):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DDPMPipeline

>>> # load model and scheduler
>>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")

>>> # run pipeline in inference (sample random noise and denoise)
>>> image = pipe().images[0]

>>> # save image
>>> image.save("ddpm_generated_image.png")
```
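A hedged variation on the example, reusing `pipe`: several seeded samples generated in one batched call, using the `batch_size` and `generator` arguments documented above.

```py
import torch

generator = torch.manual_seed(0)
images = pipe(batch_size=4, generator=generator).images

for i, image in enumerate(images):
    image.save(f"ddpm_generated_{i}.png")
```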
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
# Sample gaussian noise to begin loop
if isinstance(self.unet.config.sample_size, int):
@@ -30,16 +30,18 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class DiTPipeline(DiffusionPipeline):
r"""
This pipeline inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for image generation based on a Transformer backbone instead of a UNet.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
transformer ([`Transformer2DModel`]):
Class conditioned Transformer in Diffusion model to denoise the encoded image latents.
A class conditioned `Transformer2DModel` to denoise the encoded image latents.
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
scheduler ([`DDIMScheduler`]):
A scheduler to be used in combination with `dit` to denoise the encoded image latents.
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
"""
def __init__(
@@ -63,13 +65,15 @@ class DiTPipeline(DiffusionPipeline):
def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
r"""
Map label strings, *e.g.* from ImageNet, to corresponding class ids.
Map label strings from ImageNet to corresponding class ids.
Parameters:
label (`str` or `dict` of `str`): label strings to be mapped to class ids.
label (`str` or `list` of `str`):
Label strings to be mapped to class ids.
Returns:
`list` of `int`: Class ids to be processed by pipeline.
`list` of `int`:
Class ids to be processed by pipeline.
"""
if not isinstance(label, list):
@@ -94,24 +98,53 @@ class DiTPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[ImagePipelineOutput, Tuple]:
r"""
Function invoked when calling the pipeline for generation.
The call function to the pipeline for generation.
Args:
class_labels (List[int]):
List of imagenet class labels for the images to be generated.
List of ImageNet class labels for the images to be generated.
guidance_scale (`float`, *optional*, defaults to 4.0):
Scale of the guidance signal.
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
generator (`torch.Generator`, *optional*):
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
num_inference_steps (`int`, *optional*, defaults to 250):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Examples:
```py
>>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler
>>> import torch

>>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
>>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
>>> pipe = pipe.to("cuda")

>>> # pick words from ImageNet class labels
>>> pipe.labels  # to print all available words

>>> # pick words that exist in ImageNet
>>> words = ["white shark", "umbrella"]
>>> class_ids = pipe.get_label_ids(words)

>>> generator = torch.manual_seed(33)
>>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)

>>> image = output.images[0]  # label 'white shark'
```
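And a hedged follow-up on `guidance_scale`, reusing `pipe` and `class_ids` from the example: a higher value trades image quality for stronger class conditioning.

```py
output = pipe(
    class_labels=class_ids,
    guidance_scale=7.0,  # up from the default of 4.0
    num_inference_steps=25,
    generator=torch.manual_seed(33),
)
output.images[0].save("white_shark_cfg7.png")
```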
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
batch_size = len(class_labels)
@@ -31,18 +31,20 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
class LDMTextToImagePipeline(DiffusionPipeline):
r"""
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Pipeline for text-to-image generation using latent diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
vqvae ([`VQModel`]):
Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
Vector-quantized (VQ) model to encode and decode images to and from latent representations.
bert ([`LDMBertModel`]):
Text-encoder model based on [BERT](https://huggingface.co/docs/transformers/model_doc/bert) architecture.
tokenizer (`transformers.BertTokenizer`):
Tokenizer of class
[BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer).
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
Text-encoder model based on [`~transformers.BERT`].
tokenizer ([`~transformers.BertTokenizer`]):
A `BertTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
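Since three compatible schedulers are listed, a hedged sketch of swapping one in after loading, rebuilding it from the existing scheduler's config:

```py
from diffusers import DiffusionPipeline, LMSDiscreteScheduler

ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
ldm.scheduler = LMSDiscreteScheduler.from_config(ldm.scheduler.config)
```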
@@ -76,38 +78,54 @@ class LDMTextToImagePipeline(DiffusionPipeline):
**kwargs,
) -> Union[Tuple, ImagePipelineOutput]:
r"""
The call function to the pipeline for generation.
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 1.0):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt` at
the, usually at the expense of lower image quality.
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will be generated by sampling using the supplied random `generator`.
tensor is generated by sampling using the supplied random `generator`.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> from diffusers import DiffusionPipeline

>>> # load model and scheduler
>>> ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")

>>> # run pipeline in inference (sample random noise and denoise)
>>> prompt = "A painting of a squirrel eating a burger"
>>> images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images

>>> # save images
>>> for idx, image in enumerate(images):
...     image.save(f"squirrel-{idx}.png")
```
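The `latents` argument documented above lets two prompts start from the same noise. A hedged sketch reusing `ldm`, deriving the latent shape from the pipeline's own config rather than hard-coding it:

```py
import torch

generator = torch.manual_seed(0)
size = ldm.unet.config.sample_size  # latent spatial size at the default height/width
latents = torch.randn((1, ldm.unet.config.in_channels, size, size), generator=generator)

# the same starting noise, steered by two different prompts
for name, prompt in [("squirrel", "A painting of a squirrel eating a burger"), ("fox", "A watercolor of a fox")]:
    ldm([prompt], latents=latents, guidance_scale=6).images[0].save(f"{name}.png")
```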
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
# 0. Default height and width to unet
height = height or self.unet.config.sample_size * self.vae_scale_factor
@@ -31,15 +31,16 @@ def preprocess(image):
class LDMSuperResolutionPipeline(DiffusionPipeline):
r"""
A pipeline for image super-resolution using Latent
A pipeline for image super-resolution using latent diffusion.
This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Parameters:
vqvae ([`VQModel`]):
Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations.
unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
Vector-quantized (VQ) model to encode and decode images to and from latent representations.
unet ([`UNet2DModel`]):
A `UNet2DModel` to denoise the encoded image.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
@@ -74,30 +75,58 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
return_dict: bool = True,
) -> Union[Tuple, ImagePipelineOutput]:
r"""
The call function to the pipeline for generation.
Args:
image (`torch.Tensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
`Image` or tensor representing an image batch to be used as the starting point for the process.
batch_size (`int`, *optional*, defaults to 1):
Number of images to generate.
num_inference_steps (`int`, *optional*, defaults to 100):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
generator (`torch.Generator`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*):
Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.
Example:
```py
>>> import requests
>>> from PIL import Image
>>> from io import BytesIO
>>> from diffusers import LDMSuperResolutionPipeline
>>> import torch

>>> # load model and scheduler
>>> pipeline = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
>>> pipeline = pipeline.to("cuda")

>>> # let's download an image
>>> url = (
...     "https://user-images.githubusercontent.com/38061659/199705896-b48e17b8-b231-47cd-a270-4ffa5a93fa3e.png"
... )
>>> response = requests.get(url)
>>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
>>> low_res_img = low_res_img.resize((128, 128))

>>> # run pipeline in inference (sample random noise and denoise)
>>> upscaled_image = pipeline(low_res_img, num_inference_steps=100, eta=1).images[0]

>>> # save image
>>> upscaled_image.save("ldm_generated_image.png")
```
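A hedged variation: `image` also accepts a tensor batch directly. The PIL path is normalized to [-1, 1] by the pipeline's `preprocess`, so the assumption here is that tensor callers supply that range themselves.

```py
import torch

# stand-in for a real low-resolution batch, scaled to [-1, 1] by assumption
batch = torch.randn(1, 3, 128, 128).clamp(-1.0, 1.0)
upscaled = pipeline(batch, num_inference_steps=100, eta=1).images[0]
```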
Returns:
[`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
[`~pipelines.ImagePipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images.
"""
if isinstance(image, PIL.Image.Image):
batch_size = 1
This diff is collapsed.
@@ -16,11 +16,11 @@ class SemanticStableDiffusionPipelineOutput(BaseOutput):
Args:
images (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
num_channels)`.
nsfw_content_detected (`List[bool]`)
List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, or `None` if safety checking could not be performed.
List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
`None` if safety checking could not be performed.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
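A hedged sketch of consuming this output type; the checkpoint and prompt are placeholders, not part of this diff:

```py
from diffusers import SemanticStableDiffusionPipeline

pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
output = pipe(prompt="a photo of a cat")

image = output.images[0]
if output.nsfw_content_detected is not None:
    print(output.nsfw_content_detected)  # one bool per generated image
```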