"vscode:/vscode.git/clone" did not exist on "cbbad0af6942f1b46a5c1807edb48adb703ba96e"
Unverified Commit 4a343077 authored by Sayak Paul, committed by GitHub

add: utility to format our docs too 📜 (#7314)

* add: utility to format our docs too 📜

* debugging saga

* fix: message

* checking

* should be fixed.

* revert pipeline_fixture

* remove empty line

* make style

* fix: setup.py

* style.
parent 8e963d1c
@@ -1000,8 +1000,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
 def fuse_qkv_projections(self):
 """
-Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
-key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+are fused. For cross-attention modules, key and value projection matrices are fused.
 <Tip warning={true}>
@@ -1112,8 +1112,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
 Returns:
 [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
-If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
-a `tuple` is returned where the first element is the sample tensor.
+If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
+otherwise a `tuple` is returned where the first element is the sample tensor.
 """
 # By default samples have to be AT least a multiple of the overall upsampling factor.
 # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
......
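For orientation, a minimal sketch of how the fused QKV projections documented in the hunk above are usually enabled; it assumes the same `fuse_qkv_projections()` method is exposed by the standard Stable Diffusion UNet, and the model id and prompt are only illustrative:

```python
import torch
from diffusers import StableDiffusionPipeline

# Load a pipeline (illustrative checkpoint) and fuse the attention projections on its UNet.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.unet.fuse_qkv_projections()  # fuses q/k/v for self-attention, k/v for cross-attention

image = pipe("an astronaut riding a horse on the moon").images[0]
```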
@@ -41,20 +41,20 @@ class FreeInitMixin:
 num_iters (`int`, *optional*, defaults to `3`):
 Number of FreeInit noise re-initialization iterations.
 use_fast_sampling (`bool`, *optional*, defaults to `False`):
-Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
-the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
+Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables the
+"Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
 method (`str`, *optional*, defaults to `butterworth`):
-Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
-FreeInit low pass filter.
+Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the FreeInit low
+pass filter.
 order (`int`, *optional*, defaults to `4`):
 Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
 whereas lower values lead to `gaussian` method behaviour.
 spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
-Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
-the original implementation.
+Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in the
+original implementation.
 temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
-Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
-the original implementation.
+Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in the
+original implementation.
 """
 self._free_init_num_iters = num_iters
 self._free_init_use_fast_sampling = use_fast_sampling
......
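The FreeInit arguments documented in the hunk above map directly onto `FreeInitMixin.enable_free_init`. A minimal sketch on an AnimateDiff pipeline (model ids and prompt are illustrative):

```python
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter

# Illustrative checkpoints; any motion-module pipeline that mixes in FreeInitMixin works the same way.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

pipe.enable_free_init(
    num_iters=3,                   # noise re-initialization iterations
    use_fast_sampling=False,       # True enables "Coarse-to-Fine Sampling"
    method="butterworth",          # or "ideal" / "gaussian"
    order=4,                       # only used by the butterworth filter
    spatial_stop_frequency=0.25,   # d_s in the original implementation
    temporal_stop_frequency=0.25,  # d_t in the original implementation
)
frames = pipe("a panda surfing a wave", num_frames=16, num_inference_steps=25).frames[0]
pipe.disable_free_init()
```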
@@ -43,10 +43,14 @@ EXAMPLE_DOC_STRING = """
 >>> from diffusers import I2VGenXLPipeline
 >>> from diffusers.utils import export_to_gif, load_image
->>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
+>>> pipeline = I2VGenXLPipeline.from_pretrained(
+... "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
+... )
 >>> pipeline.enable_model_cpu_offload()
->>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+>>> image_url = (
+... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+... )
 >>> image = load_image(image_url).convert("RGB")
 >>> prompt = "Papers were floating in the air on a table in the library"
@@ -59,7 +63,7 @@ EXAMPLE_DOC_STRING = """
 ... num_inference_steps=50,
 ... negative_prompt=negative_prompt,
 ... guidance_scale=9.0,
-... generator=generator
+... generator=generator,
 ... ).frames[0]
 >>> video_path = export_to_gif(frames, "i2v.gif")
 ```
@@ -95,7 +99,8 @@ class I2VGenXLPipelineOutput(BaseOutput):
 Args:
 frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
+List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+denoised
 PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
 `(batch_size, num_frames, channels, height, width)`
 """
@@ -551,7 +556,8 @@ class I2VGenXLPipeline(
 width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
 The width in pixels of the generated image.
 target_fps (`int`, *optional*):
-Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation.
+Frames per second. The rate at which the generated images shall be exported to a video after
+generation. This is also used as a "micro-condition" while generation.
 num_frames (`int`, *optional*):
 The number of video frames to generate.
 num_inference_steps (`int`, *optional*):
@@ -568,9 +574,9 @@ class I2VGenXLPipeline(
 num_videos_per_prompt (`int`, *optional*):
 The number of images to generate per prompt.
 decode_chunk_size (`int`, *optional*):
-The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
-between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
-for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
+The number of frames to decode at a time. The higher the chunk size, the higher the temporal
+consistency between frames, but also the higher the memory consumption. By default, the decoder will
+decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
 generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
 generation deterministic.
......
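Continuing the doctest above, a hedged sketch of how `target_fps`, `num_frames`, and `decode_chunk_size` interact in the call; the values are illustrative, and a smaller `decode_chunk_size` trades temporal consistency for lower peak memory:

```python
# `pipeline`, `image`, `prompt`, and `generator` as defined in the example docstring above.
frames = pipeline(
    prompt=prompt,
    image=image,
    target_fps=16,        # export frame rate, also used as a micro-condition
    num_frames=16,
    decode_chunk_size=4,  # decode 4 frames at a time to reduce memory; omit to decode all at once
    generator=generator,
).frames[0]
```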
@@ -35,10 +35,10 @@ DYNAMIC_MAP = {
 def convert_state_dict(unet_state_dict):
 """
-Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
 Args:
-unet_model (torch.nn.Module): The original U-Net model.
-unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with.
+Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
+unet_model (torch.nn.Module): The original U-Net model. unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet
+model to match keys with.
 Returns:
 OrderedDict: The converted state dictionary.
......
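As a rough illustration of the key-renaming pattern a `convert_state_dict` helper like the one above follows (the mapping below is made up for the example; the real script drives it from `DYNAMIC_MAP`):

```python
from collections import OrderedDict

# Hypothetical prefix mapping, standing in for the real DYNAMIC_MAP entries.
EXAMPLE_MAP = {"in_layer": "conv_in", "out_layer": "conv_out"}


def rename_keys(state_dict):
    converted = OrderedDict()
    for key, tensor in state_dict.items():
        new_key = key
        for old_prefix, new_prefix in EXAMPLE_MAP.items():
            if new_key.startswith(old_prefix):
                new_key = new_key.replace(old_prefix, new_prefix, 1)
        converted[new_key] = tensor  # tensors are reused, only the keys change
    return converted
```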
@@ -24,7 +24,9 @@ EXAMPLE_DOC_STRING = """
 >>> from diffusers import AutoPipelineForText2Image
 >>> import torch
->>> pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
+>>> pipe = AutoPipelineForText2Image.from_pretrained(
+... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+... )
 >>> pipe.enable_model_cpu_offload()
 >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
......
@@ -29,11 +29,15 @@ EXAMPLE_DOC_STRING = """
 >>> from diffusers.utils import load_image
 >>> import torch
->>> pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
+>>> pipe = AutoPipelineForImage2Image.from_pretrained(
+... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
+... )
 >>> pipe.enable_model_cpu_offload()
 >>> prompt = "A painting of the inside of a subway train with tiny raccoons."
->>> image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png")
+>>> image = load_image(
+... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
+... )
 >>> generator = torch.Generator(device="cpu").manual_seed(0)
 >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
......
@@ -73,8 +73,8 @@ def retrieve_timesteps(
 scheduler (`SchedulerMixin`):
 The scheduler to get timesteps from.
 num_inference_steps (`int`):
-The number of diffusion steps used when generating samples with a pre-trained model. If used,
-`timesteps` must be `None`.
+The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+must be `None`.
 device (`str` or `torch.device`, *optional*):
 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
 timesteps (`List[int]`, *optional*):
@@ -749,10 +749,10 @@ class LatentConsistencyModelImg2ImgPipeline(
 ip_adapter_image: (`PipelineImageInput`, *optional*):
 Optional image input to work with IP Adapters.
 ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-if `do_classifier_free_guidance` is set to `True`.
-If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+provided, embeddings are computed from the `ip_adapter_image` input argument.
 output_type (`str`, *optional*, defaults to `"pil"`):
 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
 return_dict (`bool`, *optional*, defaults to `True`):
......
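A hedged sketch of the `retrieve_timesteps` contract documented above: custom `timesteps` and `num_inference_steps` are mutually exclusive, and a custom schedule requires the scheduler's `set_timesteps` to accept a `timesteps` argument (the function name below is hypothetical, not the module-level helper itself):

```python
import inspect


def retrieve_timesteps_sketch(scheduler, num_inference_steps=None, device=None, timesteps=None):
    if timesteps is not None:
        # Custom schedules are only valid for schedulers that expose a `timesteps` argument.
        if "timesteps" not in inspect.signature(scheduler.set_timesteps).parameters:
            raise ValueError(f"{scheduler.__class__.__name__} does not support custom timesteps.")
        scheduler.set_timesteps(timesteps=timesteps, device=device)
        num_inference_steps = len(scheduler.timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device)
    return scheduler.timesteps, num_inference_steps
```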
@@ -77,8 +77,8 @@ def retrieve_timesteps(
 scheduler (`SchedulerMixin`):
 The scheduler to get timesteps from.
 num_inference_steps (`int`):
-The number of diffusion steps used when generating samples with a pre-trained model. If used,
-`timesteps` must be `None`.
+The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+must be `None`.
 device (`str` or `torch.device`, *optional*):
 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
 timesteps (`List[int]`, *optional*):
@@ -681,10 +681,10 @@ class LatentConsistencyModelPipeline(
 ip_adapter_image: (`PipelineImageInput`, *optional*):
 Optional image input to work with IP Adapters.
 ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-if `do_classifier_free_guidance` is set to `True`.
-If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+provided, embeddings are computed from the `ip_adapter_image` input argument.
 output_type (`str`, *optional*, defaults to `"pil"`):
 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
 return_dict (`bool`, *optional*, defaults to `True`):
......
@@ -40,30 +40,21 @@ EXAMPLE_DOC_STRING = """
 >>> from io import BytesIO
 >>> from diffusers import LEditsPPPipelineStableDiffusion
+>>> from diffusers.utils import load_image
 >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
 ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
 ... )
 >>> pipe = pipe.to("cuda")
->>> def download_image(url):
-... response = requests.get(url)
-... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
 >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
->>> image = download_image(img_url)
->>> _ = pipe.invert(
-... image = image,
-... num_inversion_steps=50,
-... skip=0.1
-... )
+>>> image = load_image(img_url).convert("RGB")
+>>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)
 >>> edited_image = pipe(
-... editing_prompt=["cherry blossom"],
-... edit_guidance_scale=10.0,
-... edit_threshold=0.75,
-).images[0]
+... editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75
+... ).images[0]
 ```
 """
@@ -279,8 +270,8 @@ class LEditsPPPipelineStableDiffusion(
 unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
 scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
 A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
-be set to [`DPMSolverMultistepScheduler`].
+[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+automatically be set to [`DPMSolverMultistepScheduler`].
 safety_checker ([`StableDiffusionSafetyChecker`]):
 Classification module that estimates whether generated images could be considered offensive or harmful.
 Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
@@ -531,8 +522,7 @@ class LEditsPPPipelineStableDiffusion(
 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
 less than `1`).
 editing_prompt (`str` or `List[str]`, *optional*):
-Editing prompt(s) to be encoded. If not defined, one has to pass
-`editing_prompt_embeds` instead.
+Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead.
 editing_prompt_embeds (`torch.FloatTensor`, *optional*):
 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
 provided, text embeddings will be generated from `prompt` input argument.
@@ -734,8 +724,9 @@ class LEditsPPPipelineStableDiffusion(
 **kwargs,
 ):
 r"""
-The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`]
-method has to be called beforehand. Edits will always be performed for the last inverted image(s).
+The call function to the pipeline for editing. The
+[`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] method has to be called beforehand. Edits will
+always be performed for the last inverted image(s).
 Args:
 negative_prompt (`str` or `List[str]`, *optional*):
@@ -748,49 +739,51 @@ class LEditsPPPipelineStableDiffusion(
 The output format of the generate image. Choose between
 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
 return_dict (`bool`, *optional*, defaults to `True`):
-Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a
-plain tuple.
+Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a plain
+tuple.
 editing_prompt (`str` or `List[str]`, *optional*):
 The prompt or prompts to guide the image generation. The image is reconstructed by setting
-`editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
+`editing_prompt = None`. Guidance direction of prompt should be specified via
+`reverse_editing_direction`.
 editing_prompt_embeds (`torch.Tensor>`, *optional*):
-Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be
-specified via `reverse_editing_direction`.
+Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should
+be specified via `reverse_editing_direction`.
 negative_prompt_embeds (`torch.FloatTensor`, *optional*):
 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
 reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
 Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
 edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
-Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`.
-`edit_guidance_scale` is defined as `s_e` of equation 12 of
-[LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+Guidance scale for guiding the image generation. If provided as list values should correspond to
+`editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247).
 edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
 Number of diffusion steps (for each prompt) for which guidance will not be applied.
 edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
 Number of diffusion steps (for each prompt) after which guidance will no longer be applied.
 edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
 Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
-'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247).
 user_mask (`torch.FloatTensor`, *optional*):
-User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
-masks do not meet user preferences.
+User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+implicit masks do not meet user preferences.
 sem_guidance (`List[torch.Tensor]`, *optional*):
 List of pre-generated guidance vectors to be applied at generation. Length of the list has to
 correspond to `num_inference_steps`.
 use_cross_attn_mask (`bool`, defaults to `False`):
 Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
-is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
-[LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+paper](https://arxiv.org/pdf/2311.16711.pdf).
 use_intersect_mask (`bool`, defaults to `True`):
-Whether the masking term is calculated as intersection of cross-attention masks and masks derived
-from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
-estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
 attn_store_steps (`List[int]`, *optional*):
 Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
 store_averaged_over_steps (`bool`, defaults to `True`):
-Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
-If False, attention maps for each step are stores separately. Just for visualization purposes.
+Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+False, attention maps for each step are stores separately. Just for visualization purposes.
 cross_attention_kwargs (`dict`, *optional*):
 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -815,10 +808,10 @@ class LEditsPPPipelineStableDiffusion(
 Returns:
 [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
-[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
-otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the
-second element is a list of `bool`s denoting whether the corresponding generated image likely represents
-"not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+returning a tuple, the first element is a list with the generated images, and the second element is a list
+of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+content, according to the `safety_checker`.
 """
 if self.inversion_steps is None:
@@ -1219,9 +1212,9 @@ class LEditsPPPipelineStableDiffusion(
 crops_coords: Optional[Tuple[int, int, int, int]] = None,
 ):
 r"""
-The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
-If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
-will be performed instead.
+The function to the pipeline for image inversion as described by the [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.
 Args:
 image (`PipelineImageInput`):
@@ -1238,8 +1231,8 @@ class LEditsPPPipelineStableDiffusion(
 Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
 will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
 generator (`torch.Generator`, *optional*):
-A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-inversion deterministic.
+A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+deterministic.
 cross_attention_kwargs (`dict`, *optional*):
 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -1247,23 +1240,24 @@ class LEditsPPPipelineStableDiffusion(
 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
 the output of the pre-final layer will be used for computing the prompt embeddings.
 height (`int`, *optional*, defaults to `None`):
-The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height.
+The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
+height.
 width (`int`, *optional*`, defaults to `None`):
 The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
 resize_mode (`str`, *optional*, defaults to `default`):
-The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit
-within the specified width and height, and it may not maintaining the original aspect ratio.
-If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
-within the dimensions, filling empty with data from image.
-If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
-within the dimensions, cropping the excess.
-Note that resize_mode `fill` and `crop` are only supported for PIL image input.
+The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
+the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
+resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
+center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
+image to fit within the specified width and height, maintaining the aspect ratio, and then center the
+image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
+supported for PIL image input.
 crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
 The crop coordinates for each image in the batch. If `None`, will not crop the image.
 Returns:
-[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
-Output will contain the resized input image(s) and respective VAE reconstruction(s).
+[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+and respective VAE reconstruction(s).
 """
 # Reset attn processor, we do not want to store attn maps during inversion
 self.unet.set_attn_processor(AttnProcessor())
......
@@ -85,25 +85,23 @@ EXAMPLE_DOC_STRING = """
 ... )
 >>> pipe = pipe.to("cuda")
 >>> def download_image(url):
 ... response = requests.get(url)
 ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
 >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg"
 >>> image = download_image(img_url)
->>> _ = pipe.invert(
-... image = image,
-... num_inversion_steps=50,
-... skip=0.2
-... )
+>>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2)
 >>> edited_image = pipe(
-... editing_prompt=["tennis ball","tomato"],
-... reverse_editing_direction=[True,False],
-... edit_guidance_scale=[5.0,10.0],
-... edit_threshold=[0.9,0.85],
-).images[0]
+... editing_prompt=["tennis ball", "tomato"],
+... reverse_editing_direction=[True, False],
+... edit_guidance_scale=[5.0, 10.0],
+... edit_threshold=[0.9, 0.85],
+... ).images[0]
 ```
 """
@@ -292,9 +290,9 @@ class LEditsPPPipelineStableDiffusionXL(
 """
 Pipeline for textual image editing using LEDits++ with Stable Diffusion XL.
-This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the superclass
-documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
-device, etc.).
+This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the
+superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a
+particular device, etc.).
 In addition the pipeline inherits the following loading methods:
 - *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`]
@@ -325,8 +323,8 @@ class LEditsPPPipelineStableDiffusionXL(
 unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
 scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
 A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
-be set to [`DPMSolverMultistepScheduler`].
+[`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+automatically be set to [`DPMSolverMultistepScheduler`].
 force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
 Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
 `stabilityai/stable-diffusion-xl-base-1-0`.
@@ -453,9 +451,9 @@ class LEditsPPPipelineStableDiffusionXL(
 Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass
 `editing_prompt_embeds` instead.
 editing_prompt_embeds (`torch.FloatTensor`, *optional*):
-Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input
-argument.
+Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from
+`editing_prompt` input argument.
 editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
 Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt`
@@ -835,8 +833,9 @@ class LEditsPPPipelineStableDiffusionXL(
 **kwargs,
 ):
 r"""
-The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`]
-method has to be called beforehand. Edits will always be performed for the last inverted image(s).
+The call function to the pipeline for editing. The
+[`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] method has to be called beforehand. Edits
+will always be performed for the last inverted image(s).
 Args:
 denoising_end (`float`, *optional*):
@@ -894,11 +893,11 @@ class LEditsPPPipelineStableDiffusionXL(
 section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
 editing_prompt (`str` or `List[str]`, *optional*):
 The prompt or prompts to guide the image generation. The image is reconstructed by setting
-`editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
+`editing_prompt = None`. Guidance direction of prompt should be specified via
+`reverse_editing_direction`.
 editing_prompt_embeddings (`torch.Tensor`, *optional*):
-Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
-argument.
+Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument.
 editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*):
 Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
@@ -906,35 +905,36 @@ class LEditsPPPipelineStableDiffusionXL(
 reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
 Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
 edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
-Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`.
-`edit_guidance_scale` is defined as `s_e` of equation 12 of
-[LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+Guidance scale for guiding the image generation. If provided as list values should correspond to
+`editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247).
 edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
 Number of diffusion steps (for each prompt) for which guidance is not applied.
 edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
 Number of diffusion steps (for each prompt) after which guidance is no longer applied.
 edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
 Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
-'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247).
 sem_guidance (`List[torch.Tensor]`, *optional*):
 List of pre-generated guidance vectors to be applied at generation. Length of the list has to
 correspond to `num_inference_steps`.
 use_cross_attn_mask:
 Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
-is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
-[LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+paper](https://arxiv.org/pdf/2311.16711.pdf).
 use_intersect_mask:
-Whether the masking term is calculated as intersection of cross-attention masks and masks derived
-from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
-estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
 user_mask:
-User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
-masks do not meet user preferences.
+User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+implicit masks do not meet user preferences.
 attn_store_steps:
 Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
 store_averaged_over_steps:
-Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
-If False, attention maps for each step are stores separately. Just for visualization purposes.
+Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+False, attention maps for each step are stores separately. Just for visualization purposes.
 clip_skip (`int`, *optional*):
 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
 the output of the pre-final layer will be used for computing the prompt embeddings.
@@ -952,8 +952,8 @@ class LEditsPPPipelineStableDiffusionXL(
 Returns:
 [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
-[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
-otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+[`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+returning a tuple, the first element is a list with the generated images.
 """
 if self.inversion_steps is None:
 raise ValueError(
@@ -1446,9 +1446,9 @@ class LEditsPPPipelineStableDiffusionXL(
 cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 ):
 r"""
-The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
-If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
-will be performed instead.
+The function to the pipeline for image inversion as described by the [LEDITS++
+Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.
 Args:
 image (`PipelineImageInput`):
@@ -1472,8 +1472,8 @@ class LEditsPPPipelineStableDiffusionXL(
 Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
 will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
 generator (`torch.Generator`, *optional*):
-A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-inversion deterministic.
+A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+deterministic.
 crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
 `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
 `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
@@ -1488,8 +1488,8 @@ class LEditsPPPipelineStableDiffusionXL(
 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
 Returns:
-[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
-Output will contain the resized input image(s) and respective VAE reconstruction(s).
+[`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+and respective VAE reconstruction(s).
 """
 # Reset attn processor, we do not want to store attn maps during inversion
......
@@ -35,8 +35,8 @@ class LEditsPPInversionPipelineOutput(BaseOutput):
 List of the cropped and resized input images as PIL images of length `batch_size` or NumPy array of shape `
 (batch_size, height, width, num_channels)`.
 vae_reconstruction_images (`List[PIL.Image.Image]` or `np.ndarray`)
-List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape `
-(batch_size, height, width, num_channels)`.
+List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape
+` (batch_size, height, width, num_channels)`.
 """
 images: Union[List[PIL.Image.Image], np.ndarray]
......
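Continuing the LEDITS++ examples above, a short sketch of reading the inversion output fields documented in this hunk (attribute names come from the docstring; `pipe` and `image` are as in the earlier example):

```python
inversion_output = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)
resized_inputs = inversion_output.images                      # cropped/resized input image(s)
reconstructions = inversion_output.vae_reconstruction_images  # corresponding VAE reconstructions
```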
@@ -59,6 +59,7 @@ EXAMPLE_DOC_STRING = """
 ... PIAPipeline,
 ... )
 >>> from diffusers.utils import export_to_gif, load_image
+>>> adapter = MotionAdapter.from_pretrained("../checkpoints/pia-diffusers")
 >>> pipe = PIAPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter)
 >>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
@@ -135,9 +136,9 @@ class PIAPipelineOutput(BaseOutput):
 Args:
 frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
-NumPy array of shape `(batch_size, num_frames, channels, height, width,
-Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
+Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, NumPy array of
+shape `(batch_size, num_frames, channels, height, width, Torch tensor of shape `(batch_size, num_frames,
+channels, height, width)`.
 """
 frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
@@ -759,16 +760,15 @@ class PIAPipeline(
 ip_adapter_image: (`PipelineImageInput`, *optional*):
 Optional image input to work with IP Adapters.
 ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-if `do_classifier_free_guidance` is set to `True`.
-If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+provided, embeddings are computed from the `ip_adapter_image` input argument.
 motion_scale: (`int`, *optional*, defaults to 0):
-Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific
-ranges of values control the type of motion that is added. Must be between 0 and 8.
-Set between 0-2 to only increase the amount of motion.
-Set between 3-5 to create looping motion.
-Set between 6-8 to perform motion with image style transfer.
+Parameter that controls the amount and type of motion that is added to the image. Increasing the value
+increases the amount of motion, while specific ranges of values control the type of motion that is
+added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5
+to create looping motion. Set between 6-8 to perform motion with image style transfer.
 output_type (`str`, *optional*, defaults to `"pil"`):
 The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
 `np.array`.
@@ -795,8 +795,8 @@ class PIAPipeline(
 Returns:
 [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`:
-If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is
-returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is returned, otherwise a
+`tuple` is returned where the first element is a list with the generated frames.
 """
 # 0. Default height and width to unet
 height = height or self.unet.config.sample_size * self.vae_scale_factor
......
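Since the `motion_scale` ranges rewrapped above are easy to misread, here is a hedged usage sketch of the PIA pipeline with a small motion increase (`motion_scale=2`). The adapter and base checkpoint ids mirror the public PIA example and are assumptions rather than part of this diff (the docstring example above still points at a local path):

```py
import torch
from diffusers import EulerDiscreteScheduler, MotionAdapter, PIAPipeline
from diffusers.utils import export_to_gif, load_image

# Public PIA motion adapter checkpoint (assumption).
adapter = MotionAdapter.from_pretrained("openmmlab/PIA-condition-adapter")
pipe = PIAPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
)
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

image = load_image("https://example.com/condition.png")  # placeholder conditioning image

output = pipe(
    image=image,
    prompt="a cat wearing sunglasses",
    motion_scale=2,  # 0-2: more motion, 3-5: looping motion, 6-8: motion with style transfer
    num_inference_steps=25,
)
export_to_gif(output.frames[0], "pia_output.gif")
```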
...@@ -538,7 +538,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin): ...@@ -538,7 +538,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
allowed by Git. allowed by Git.
custom_revision (`str`, *optional*): custom_revision (`str`, *optional*):
The specific model version to use. It can be a branch name, a tag name, or a commit id similar to The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
`revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers version. `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers
version.
mirror (`str`, *optional*): mirror (`str`, *optional*):
Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
guarantee the timeliness or safety of the source, and you should refer to the mirror site for more guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
...@@ -1669,7 +1670,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin): ...@@ -1669,7 +1670,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
@classmethod @classmethod
def from_pipe(cls, pipeline, **kwargs): def from_pipe(cls, pipeline, **kwargs):
r""" r"""
Create a new pipeline from a given pipeline. This method is useful for creating a new pipeline from existing pipeline components without allocating additional memory. Create a new pipeline from a given pipeline. This method is useful for creating a new pipeline from existing
pipeline components without allocating additional memory.
Arguments: Arguments:
pipeline (`DiffusionPipeline`): pipeline (`DiffusionPipeline`):
...@@ -1851,8 +1853,8 @@ class StableDiffusionMixin: ...@@ -1851,8 +1853,8 @@ class StableDiffusionMixin:
def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
""" """
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
key, value) are fused. For cross-attention modules, key and value projection matrices are fused. are fused. For cross-attention modules, key and value projection matrices are fused.
<Tip warning={true}> <Tip warning={true}>
......
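The `from_pipe` and `fuse_qkv_projections` docstrings above describe behaviour rather than usage, so a minimal sketch may help; the checkpoint id is an assumption and the target pipeline class is just one possible choice:

```py
import torch
from diffusers import DiffusionPipeline, StableDiffusionSAGPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Fuse the QKV projections in the UNet and VAE attention modules (experimental API).
pipe.fuse_qkv_projections()

# Reuse the already-loaded components in a different pipeline class; no extra
# copies of the text encoder, UNet, or VAE are allocated.
pipe_sag = StableDiffusionSAGPipeline.from_pipe(pipe)
image = pipe_sag("a photo of an astronaut riding a horse").images[0]

pipe.unfuse_qkv_projections()
```

Because `from_pipe` shares the component modules rather than copying them, changes such as the fused projections apply to both pipeline objects.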
...@@ -186,8 +186,8 @@ def retrieve_timesteps( ...@@ -186,8 +186,8 @@ def retrieve_timesteps(
scheduler (`SchedulerMixin`): scheduler (`SchedulerMixin`):
The scheduler to get timesteps from. The scheduler to get timesteps from.
num_inference_steps (`int`): num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
`timesteps` must be `None`. must be `None`.
device (`str` or `torch.device`, *optional*): device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved. The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*): timesteps (`List[int]`, *optional*):
......
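`retrieve_timesteps` shows up in several of the files touched here. The contract stated in its docstring (pass either `num_inference_steps` or explicit `timesteps`, never both) can be illustrated with a short sketch; each touched file carries its own copy of the helper, and importing it from the text-to-image pipeline module below is an assumption for illustration only:

```py
import inspect

from diffusers import EulerDiscreteScheduler
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps

scheduler = EulerDiscreteScheduler.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
)

# Derive the timestep schedule from a step count (`timesteps` must be None here).
timesteps, num_inference_steps = retrieve_timesteps(
    scheduler, num_inference_steps=30, device="cpu"
)

# Explicit timesteps only work if the scheduler's `set_timesteps` accepts a
# `timesteps` argument; otherwise the helper raises a ValueError.
if "timesteps" in inspect.signature(scheduler.set_timesteps).parameters:
    custom_timesteps, _ = retrieve_timesteps(
        scheduler, device="cpu", timesteps=[900, 700, 500, 300, 100]
    )
```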
...@@ -334,8 +334,8 @@ class StableCascadeDecoderPipeline(DiffusionPipeline): ...@@ -334,8 +334,8 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
argument. argument.
negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
argument. input argument.
num_images_per_prompt (`int`, *optional*, defaults to 1): num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt. The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
......
...@@ -31,7 +31,10 @@ TEXT2IMAGE_EXAMPLE_DOC_STRING = """ ...@@ -31,7 +31,10 @@ TEXT2IMAGE_EXAMPLE_DOC_STRING = """
```py ```py
>>> import torch >>> import torch
>>> from diffusers import StableCascadeCombinedPipeline >>> from diffusers import StableCascadeCombinedPipeline
>>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16)
>>> pipe = StableCascadeCombinedPipeline.from_pretrained(
... "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
... )
>>> pipe.enable_model_cpu_offload() >>> pipe.enable_model_cpu_offload()
>>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
>>> images = pipe(prompt=prompt) >>> images = pipe(prompt=prompt)
......
...@@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline): ...@@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline):
prior ([`StableCascadeUNet`]): prior ([`StableCascadeUNet`]):
The Stable Cascade prior to approximate the image embedding from the text and/or image embedding. The Stable Cascade prior to approximate the image embedding from the text and/or image embedding.
text_encoder ([`CLIPTextModelWithProjection`]): text_encoder ([`CLIPTextModelWithProjection`]):
Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). Frozen text-encoder
([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
feature_extractor ([`~transformers.CLIPImageProcessor`]): feature_extractor ([`~transformers.CLIPImageProcessor`]):
Model that extracts features from generated images to be used as inputs for the `image_encoder`. Model that extracts features from generated images to be used as inputs for the `image_encoder`.
image_encoder ([`CLIPVisionModelWithProjection`]): image_encoder ([`CLIPVisionModelWithProjection`]):
...@@ -420,11 +421,11 @@ class StableCascadePriorPipeline(DiffusionPipeline): ...@@ -420,11 +421,11 @@ class StableCascadePriorPipeline(DiffusionPipeline):
argument. argument.
negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
argument. input argument.
image_embeds (`torch.FloatTensor`, *optional*): image_embeds (`torch.FloatTensor`, *optional*):
Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If
If not provided, image embeddings will be generated from the `image` input argument, if available. not provided, image embeddings will be generated from the `image` input argument, if available.
num_images_per_prompt (`int`, *optional*, defaults to 1): num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt. The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
...@@ -452,9 +453,9 @@ class StableCascadePriorPipeline(DiffusionPipeline): ...@@ -452,9 +453,9 @@ class StableCascadePriorPipeline(DiffusionPipeline):
Examples: Examples:
Returns: Returns:
[`StableCascadePriorPipelineOutput`] or `tuple`: [`StableCascadePriorPipelineOutput`] if [`StableCascadePriorPipelineOutput`] or `tuple`: [`StableCascadePriorPipelineOutput`] if `return_dict` is
`return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
generated image embeddings. embeddings.
""" """
# 0. Define commonly used variables # 0. Define commonly used variables
......
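The prior and decoder hunks above both deal with pre-generated (pooled) embeddings; the typical two-stage flow that produces and then consumes `image_embeddings` looks roughly like this. The checkpoint ids follow the public Stable Cascade release and are assumptions here:

```py
import torch
from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline

prompt = "an image of a shiba inu, donning a spacesuit and helmet"

prior = StableCascadePriorPipeline.from_pretrained(
    "stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16
)
decoder = StableCascadeDecoderPipeline.from_pretrained(
    "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.float16
)

prior.enable_model_cpu_offload()
# Stage C (prior): text -> image embeddings.
prior_output = prior(prompt=prompt, num_inference_steps=20, guidance_scale=4.0)

decoder.enable_model_cpu_offload()
# Stage B (decoder): image embeddings + text -> pixels.
images = decoder(
    image_embeddings=prior_output.image_embeddings.to(torch.float16),
    prompt=prompt,
    num_inference_steps=10,
    guidance_scale=0.0,
).images
images[0].save("cascade.png")
```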
...@@ -85,8 +85,8 @@ def retrieve_timesteps( ...@@ -85,8 +85,8 @@ def retrieve_timesteps(
scheduler (`SchedulerMixin`): scheduler (`SchedulerMixin`):
The scheduler to get timesteps from. The scheduler to get timesteps from.
num_inference_steps (`int`): num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
`timesteps` must be `None`. must be `None`.
device (`str` or `torch.device`, *optional*): device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved. The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*): timesteps (`List[int]`, *optional*):
...@@ -801,10 +801,10 @@ class StableDiffusionPipeline( ...@@ -801,10 +801,10 @@ class StableDiffusionPipeline(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-adapters. Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
if `do_classifier_free_guidance` is set to `True`. contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
If not provided, embeddings are computed from the `ip_adapter_image` input argument. provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`. The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
......
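The `ip_adapter_image_embeds` wording rewrapped above is dense; in practice most users pass `ip_adapter_image` and let the pipeline compute the embeddings itself. A hedged sketch, where the adapter repo and weight name follow the public IP-Adapter release and the image URL is a placeholder:

```py
import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils import load_image

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Public IP-Adapter weights for SD 1.5 (assumption).
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipe.set_ip_adapter_scale(0.6)

style_image = load_image("https://example.com/style.png")  # placeholder reference image

# The pipeline computes the positive (and, under classifier-free guidance, negative)
# image embeddings internally; precomputed tensors could instead be passed via
# `ip_adapter_image_embeds`, one entry per loaded IP-Adapter.
image = pipe(
    prompt="a cat, best quality",
    ip_adapter_image=style_image,
    num_inference_steps=30,
).images[0]
```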
...@@ -125,8 +125,8 @@ def retrieve_timesteps( ...@@ -125,8 +125,8 @@ def retrieve_timesteps(
scheduler (`SchedulerMixin`): scheduler (`SchedulerMixin`):
The scheduler to get timesteps from. The scheduler to get timesteps from.
num_inference_steps (`int`): num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
`timesteps` must be `None`. must be `None`.
device (`str` or `torch.device`, *optional*): device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved. The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*): timesteps (`List[int]`, *optional*):
...@@ -897,10 +897,10 @@ class StableDiffusionImg2ImgPipeline( ...@@ -897,10 +897,10 @@ class StableDiffusionImg2ImgPipeline(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-adapters. Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
if `do_classifier_free_guidance` is set to `True`. contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
If not provided, embeddings are computed from the `ip_adapter_image` input argument. provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`. The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
......
...@@ -189,8 +189,8 @@ def retrieve_timesteps( ...@@ -189,8 +189,8 @@ def retrieve_timesteps(
scheduler (`SchedulerMixin`): scheduler (`SchedulerMixin`):
The scheduler to get timesteps from. The scheduler to get timesteps from.
num_inference_steps (`int`): num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
`timesteps` must be `None`. must be `None`.
device (`str` or `torch.device`, *optional*): device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved. The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*): timesteps (`List[int]`, *optional*):
...@@ -1022,11 +1022,12 @@ class StableDiffusionInpaintPipeline( ...@@ -1022,11 +1022,12 @@ class StableDiffusionInpaintPipeline(
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image. The width in pixels of the generated image.
padding_mask_crop (`int`, *optional*, defaults to `None`): padding_mask_crop (`int`, *optional*, defaults to `None`):
The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
`padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio as the image that image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
contains all of the masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on with the same aspect ratio as the image that contains all of the masked area, and then expand that area
the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large based on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area
and contains information irrelevant for inpainting, such as background. before resizing to the original image size for inpainting. This is useful when the masked area is small
while the image is large and contains information irrelevant for inpainting, such as background.
strength (`float`, *optional*, defaults to 1.0): strength (`float`, *optional*, defaults to 1.0):
Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
starting point and more noise is added the higher the `strength`. The number of denoising steps depends starting point and more noise is added the higher the `strength`. The number of denoising steps depends
...@@ -1066,10 +1067,10 @@ class StableDiffusionInpaintPipeline( ...@@ -1066,10 +1067,10 @@ class StableDiffusionInpaintPipeline(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-adapters. Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
if `do_classifier_free_guidance` is set to `True`. contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
If not provided, embeddings are computed from the `ip_adapter_image` input argument. provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`): output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`. The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`): return_dict (`bool`, *optional*, defaults to `True`):
......
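The reflowed `padding_mask_crop` description above is the subtle one in this file, so a short usage sketch may help; the checkpoint id and image URLs are placeholder assumptions:

```py
import torch
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

init_image = load_image("https://example.com/room.png")       # placeholder
mask_image = load_image("https://example.com/room_mask.png")  # placeholder; white = inpaint

# With `padding_mask_crop=32`, only a 32px-padded crop around the masked region
# (matching the image's aspect ratio) is denoised, then pasted back into the original.
image = pipe(
    prompt="a wooden chair",
    image=init_image,
    mask_image=mask_image,
    padding_mask_crop=32,
    strength=1.0,
).images[0]
```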